Home Building DHTML Scripting Using Diaperglu Documention Key Script Commands Reference C Library API Reference Handy References About License Contact Forth Draft Standard
How to use Compiling call n Making .o files Making .dylib files Making flat .o bufs Platform Independence Hello World 64 exe Diaperglu linking Diaperglu Life Syntax Addressing modes Calling C on Mac Calling C on Win64 P.I.C. Flow control Reference Docs x86 words AAA, AAD, AAM, AAS, ADC, ADCX, ADD, ADDPD, ADDPS, ADDSD, ADDSS, ADDSUBPD, ADDSUBPS, ADOX, AESDEC, AESDECLAST, AESENC, AESENCLAST, AESKEYGENASSIST, AND, ANDN, ANDNPD, ANDNPS, ANDPD, ANDPS, ARPL, BEXTR, BLENDPD, BLENDPS, BLENDVPD, BLENDVPS, BLSI, BLSMSK, BLSR, BOUND, BSF, BSR, BSWAP, BT, BTC, BTR, BTS, BZHI, CALL, CALL[SS]+N16, CALL[SS]+N32, CALL[SS]+N64, CBW, CDQ, CDQE, CLAC, CLC, CLD, CLI, CLTS, CMC, CMOV, CMP, CMPPD, CMPPS, CMPSB, CMPSD, CMPSD2, CMPSQ, CMPSS, CMPSW, CMPXCHG, CMPXCHG16B, CMPXCHG8B, COMISD, COMISS, CPUID, CQO, CRC32, CVTDQ2PD, CVTDQ2PS, CVTPD2DQ, CVTPD2PI, CVTPD2PS, CVTPI2PD, CVTPI2PS, CVTPS2DQ, CVTPS2PD, CVTPS2PI, CVTSD2SI, CVTSD2SS, CVTSI2SD, CVTSI2SS, CVTSS2SD, CVTSS2SI, CVTTPD2DQ, CVTTPD2PI, CVTTPS2DQ, CVTTPS2PI, CVTTSD2SI, CVTTSS2SI, CWD, CWDE, DAA, DAS, DEC, DIV, DIVPD, DIVPS, DIVSD, DIVSS, DPPD, DPPS, EMMS, ENTER, ENTER-RBP-FRAME, EXTRAXTPS, F2XM1, FABS, FADD->ST0, FADDST0-> FADDPST0-> FIADD->ST0, FBL, FBSTP, FCHS, FCLEX, FNCLEX, FCMOVB, FCMOVBE, FCMOVE, FCMOVU, FCMOVNB, FCMOVNBE, FCMOVNE, FCMOVNU, FCOM->ST0, FCOMP->ST0, FCOMPP, FCOMI->ST0, FCOMIP->ST0, FUCOMI->ST0, FUCOMIP->ST0, FCOS, FDECSTP, FDIV->ST0, FDIVST0->, FDIVPST0->, FIDIV->ST0, FDIVR->ST0, FDIVRST0->, FDIVRPST0->, FIDIVR->ST0, FFREE, FICOM->ST0, FICIOMP->ST0, FILD, FINCSTP, FINIT, FNINIT, FIST, FISTP, FISTTP, FLD, FLD1, FLD2T, FLD2E, FLDPI, FLDG2, FLDN2, FLDZ, FLDCW, FLDENV, FMUL->ST0, FMULST0->, FMULPST0->, FIMUL->ST0, FNOP, FPATAN, FPREM, FPREM1, FPTAN, FRAME,> FRAME> FRNDINT, FRSTOR, FSAVE, FNSAVE, FSCALE, FSIN, FSINCOS, FSQRT, FST, FSTP, FSTCW, FNSTCW, FSTENV, FNSTENV, FSTSW, FNSTSW, FSUB->ST0, FSUBST0->, FSUBPST0->, FISUB->ST0, FSUBR->ST0, FSUBRST0->, FSUBRST0->, FISUBR->ST0, FTST, FUCOM->ST0, FUCOMP->ST0, FUCOMPP, 
FXAM, FXCH, FXCHST0->, FXTRACT, FYL2X, FYL2XP1, FDUP, HADDPD, HADDPS, HLT, HSUBPD, HSUBPS, IDIV, IMULA, IMUL, IMULN, IN[DX]->AL, IN[DX]->AX, IN[DX]->EAX, IN[N8]->AL, IN[N8]->AX, IN[N8]->EAX, INC, INSB, INSD, INSW, INSERTPS, INT, INT3, INTO, INVD, INVLPG, INVPCID, IRETD, IRETQ, JMP, JMP[SS]+N16, JMP[SS]+N32, JMP[SS]+N64, LAHF, LAR, LDDQU, LDMXCSR, LDS, LES, LFS, LGS, LSS, LEA, LEAVE, LFENCE, LGDT, LIDT, LLDT, LMSW, LOCAL-CELLS-ALLOCATE, LOCAL-[RBP+N]> LOCK, LODSB, LODSD, LODSQ, LODSW, LTR, LZCNT, MASKMOVDQU, MASKMOVQ, MAXPD, MAXPS, MAXSD, MAXSS, MFENCE, MINPD, MINPS, MINSD, MINSS, MONITOR, MOV, MOV[N]->AL, MOV[N]->AX, MOV[N]->EAX, MOV[N]->RAX, MOVAL->[N] MOVAX->[N] MOVEAX->[N] MOVRAX->[N] MOVCR, MOVD, MOVQ2, MOVDDUP, MOVDQ2Q, MOVDR, MOVSR, MOVAPD, MOVAPS, MOVBE, MOVDQA, MOVDQU, MOVHLPS, MOVHPD, MOVHPS, MOVLHPS, MOVLPD, MOVLPS, MOVMSKPD, MOVMSKPS, MOVNTDQ, MOVNTDQA, MOVNTI, MOVNTPD, MOVNTPS, MOVQ, MOVQ2DQ, MOVSB, MOVSD, MOVSQ, MOVSW, MOVSD2, MOVSHDUP, MOVSLDUP, MOVSS, MOVSX, MOVSXD, MOVUPD, MOVUPS, MOVZX, MPSADBW, MUL, MULPD, MULPS, MULSD, MULSS, MULX, MWAIT, NEG, NOP, NOT, OR, ORPD, ORPS, AL->OUT[N8] AX->OUT[N8] EAX->OUT[N8] AL->OUT[DX] AX->OUT[DX] EAX->OUT[DX] OUTSB, OUTSD, OUTSW, PABSB, PABSD, PABSW, PACKSSWB, PACKSSDW, PACKUSDW, PACKUSWB, PADDB, PADDD, PADDQ, PADDW, PADDSB, PADDSW, PADDUSB, PADDUSW, PALIGNR, PAND, PANDN, PAUSE, PAVGB, PAVGW, PBLENDVB, PLENDW, PCLMULQDQ, PCMPEQB, PCMPEQD, PCMPEQQ, PCMPEQW, PCMPESTRI, PCMPESTRM, PCMPGTB, PCMPGTD, PCMPGTQ, PCMPGTW, PCMPISTRI, PCMPISTRM, PDEP, PEXT, PEXTRB, PEXTRD, PEXTRQ, PEXTRW, PHADDD, PHADDW, PHADDSW, PHMINPOSUW, PHSUBD, PHSUBW, PHSUBSW, PINSRB, PINSRD, PINSRQ, PINSRW, PMADDUBSW, PMADDWD, PMAXSB, PMAXSD, PMAXSW, PMAXUB, PMAXUD, PMAXUW, PMINSB, PMINSD, PMINSW, PMINUB, PMINUD, PMINUW, PMOVMSKB, PMOVSXBW, PMOVSXBD, PMOVSXBQ, PMOVSXDQ, PMOVSXWD, PMOVSXWQ, PMOVZXBW, PMOVZXBD, PMOVZXBQ, PMOVZXDQ, PMOVZXWD, PMOVZXWQ, PMULDQ, PMULHRSW, PMULHUW, PMULHW, PMULLD, PMULLW, PMULUDQ, POP, POPAD, POPCNT, POPF, POPFD, POPFQ, POR, 
PREFETCHNTA, PREFETCHT0, PREFETCHT1, PREFETCHT2, PREFETCHW, PREFETCHWT1, PSADBW, PSHUFB, PSHUFD, PSHUFHW, PSHUFLW, PSHUFW, PSIGNB, PSIGND, PSIGNW, PSLLD, PSLLQ, PSLLW, PSRAD, PSRAW, PSRLDQ, PSRLD, PSRLQ, PSRLW, PSUBB, PSUBD, PSUBQ, PSUBW, PSUBSB, PSUBSW, PSUBUSB, PSUBUSW, PTEST, PUNPCKHBW, PUNPCKHWD, PUNPCKHDQ, PUNPCKHQDQ, PUNPCKLBW, PUNPCKLWD, PUNPCKLDQ, PUNPCKLQDQ, PUSH, PUSHAD, PUSHF, PUSHFD, PUSHFQ, PXOR, RCL, RCR, RCPPS, RCPSS, RDFSBASE, RDGSBASE, RDMSR, RDPMC, RDRAND, RDSEED, RDTSC, RDTSCP, REP, REPE, NZUNTILREP, REPNE, ZSUNTILREP, RET, RETDROPN16, RETFAR, RETFARDROPN16, ROL, ROR, RORX, ROUNDPD, ROUNDPS, ROUNDSD, ROUNDSS, RSM, RSQRTPS, RSQRTSS, SAHF, SAL, SAR, SARX, SBB, SCASB, SCASD, SCASQ, SCASW, SET, SFENCE, SGDT, SHL, SHLD, SHLX, SHR, SHRD, SHRX, SHUFPD, SHUFPS, SIDT, SLDT, SMSW, SQRTPD, SQRTPS, SQRTSD, SQRTSS, STAC, STC, STD, STI, STMXCSR, STOSB, STOSD, STOSW, STOSQ, STR, SUB, SUBPD, SUBPS, SUBSD, SUBSS, SWAPGS, SYSCALL, SYSENTER, SYSEXIT, SYSRETTO32, SYSRETTO64, TEST, UCOMISD, UCOMISS, UD2, UNPCKHPD, UNPCKHPS, UNPCKLPD, UNPCKLPS, VADDPD, VADDPS, VADDSD, VADDSS, VADDSUBPD, VADDSUBPS, VAESDEC, VAESDECLAST, VAESENC, VAESENCLAST, VAESIMC, VAESKEYGENASSIST, VANDNPD, VANDNPS, VANDPD, VANDPS, VBLENDPD, VBLENDPS, VBLENDVPD, VBLENDVPS, VBROADCASTF128, VBROADCASTSD, VBROADCASTSS, VCMPPD, VCMPPS, VCMPSD, VCMPSS, VCOMISD, VCOMISS, VCVTDQ2PD, VCVTDQ2PS, VCVTPD2DQ, VCVTPH2PS, VCVTPS2DQ, VCVTPS2PD, VCVTPS2PH, VCVTSD2SI, VCVTSD2SS, VCVTSI2SD, VCVTSI2SS, VCVTSS2SD, VCVTSS2SI, VCVTTPD2DQ, VCVTTPS2DQ, VCVTTSD2SI, VCVTTSS2SI, VDIVPD, VDIVPS, VDIVSD, VDIVSS, VDPPD, VDPPS, VERR, VERW, VEXTRACTF128, VEXTRACTI128, VEXTRACTPS, VFMADD132PD, VFMADD213PD, VFMADD231PD, VFMADD132PS, VFMADD213PS, VFMADD231PS, VFMADD132SD, VFMADD213SD, VFMADD231SD, VFMADD132SS, VFMADD213SS, VFMADD231SS, VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD, VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS, VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD, VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS, 
VFMSUB132PD, VFMSUB213PD, VFMSUB231PD, VFMSUB132PS, VFMSUB213PS, VFMSUB231PS, VFMSUB132SD, VFMSUB213SD, VFMSUB231SD, VFMSUB132SS, VFMSUB213SS, VFMSUB231SS, VFNMADD132PD, VFNMADD213PD, VFNMADD231PD, VFNMADD132PS, VFNMADD213PS, VFNMADD231PS, VFNMADD132SD, VFNMADD213SD, VFNMADD231SD, VFNMADD132SS, VFNMADD213SS, VFNMADD231SS, VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD, VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS, VFNMSUB132SD, VFNMSUB213SD, VFNMSUB231SD, VFNMSUB132SS, VFNMSUB213SS, VFNMSUB231SS, VGATHERDPD, VGATHERDPS, VGATHERQPD, VGATHERQPS, VHADDPD, VHADDPS, VHSUBPD, VHSUBPS, VINSERTF128, VINSERTI128, VINSERTPS, VLDDQU, VLDMXCSR, VMASKMOVDQU, VMASKMOVPD, VMASKMOVPS, VMAXPD, VMAXPS, VMAXSD, VMAXSS, VMINPD, VMINPS, VMINSD, VMINSS, VMOVAPD, VMOVAPS, VMOVD, VMOVDDUP, VMOVDQA, VMOVDQU, VMOVHLPS, VMOVHPD, VMOVHPD2, VMOVHPS, VMOVHPS2, VMOVLHPS, VMOVLPD, VMOVLPD2, VMOVLPS, VMOVLPS2, VMOVMSKPD, VMOVMSKPS, VMOVNTDQ, VMOVNTDQA, VMOVNTPD, VMOVNTPS, VMOVQ, VMOVQ2, VMOVSD, VMOVSD2, VMOVSHDUP, VMOVSLDUP, VMOVSS, VMOVSS2, VMOVUPD, VMOVUPS, VMPSADBW, VMULPD, VMULPS, VMULSD, VMULSS, VORPD, VORPS, VPABSB, VPABSD, VPABSW, VPACKSSDW, VPACKSSWB, VPACKUSDW, VPACKUSWB, VPADDB, VPADDD, VPADDQ, VPADDSB, VPADDSW, VPADDUSB, VPADDUSW, VPADDW, VPALIGNR, VPAND, VPANDN, VPAVGB, VPAVGW, VPBLENDD, VPBLENDVB, VPBLENDW, VPBROADCASTB, VPBROADCASTD, VPBROADCASTI128, VPBROADCASTQ, VPBROADCASTW, VPCLMULQDQ, VPCMPEQB, VPCMPEQD, VPCMPEQQ, VPCMPEQW, VPCMPESTRI, VPCMPESTRM, VPCMPGTB, VPCMPGTD, VPCMPGTQ, VPCMPGTW, VPCMPISTRI, VPCMPISTRM, VPERM2F128, VPERM2I128, VPERMD, VPERMILPD, VPERMILPS, VPERMPD, VPERMPS, VPERMQ, VPEXTRB, VPEXTRD, VPEXTRQ, VPEXTRW, VPEXTRW2, VPGATHERDD, VPGATHERDQ, VPGATHERQD, VPGATHERQQ, VPHADDD, VPHADDSW, VPHADDW, VPHMINPOSUW, VPHSUBD, VPHSUBSW, VPHSUBW, VPINSRB, VPINSRD, VPINSRQ, VPINSRW, VPMADDUBSW, VPMADDWD, VPMASKMOVD, VPMASKMOVQ, VPMAXSB, VPMAXSD, VPMAXSW, VPMAXUB, VPMAXUD, VPMAXUW, VPMINSB, VPMINSD, VPMINSW, VPMINUB, VPMINUD, VPMINUW, VPMOVMSKB, VPMOVSXBD, VPMOVSXBQ, VPMOVSXBW, VPMOVSXDQ, 
VPMOVSXWD, VPMOVSXWQ, VPMOVZXBD, VPMOVZXBQ, VPMOVZXBW, VPMOVZXDQ, VPMOVZXWD, VPMOVZXWQ, VPMULDQ, VPMULHRSW, VPMULHUW, VPMULHW, VPMULLD, VPMULLW, VPMULUDQ, VPOR, VPSADBW, VPSHUFB, VPSHUFD, VPSHUFHW, VPSHUFLW, VPSIGNB, VPSIGND, VPSIGNW, VPSLLD, VPSLLDQ, VPSLLQ, VPSLLVD, VPSLLVQ, VPSLLW, VPSRAD, VPSRAVD, VPSRAW, VPSRLD, VPSRLDQ, VPSRLQ, VPSRLVD, VPSRLVQ, VPSRLW, VPSUBB, VPSUBD, VPSUBQ, VPSUBSB, VPSUBSW, VPSUBUSB, VPSUBUSW, VPSUBW, VPTEST, VPUNPCKHBW, VPUNPCKHDQ, VPUNPCKHQDQ, VPUNPCKHWD, VPUNPCKLBW, VPUNPCKLDQ, VPUNPCKLQDQ, VPUNPCKLWD, VPXOR, VRCPPS, VRCPSS, VROUNDPD, VROUNDPS, VROUNDSD, VROUNDSS, VRSQRTPS, VRSQRTSS, VSHUFPD, VSHUFPS, VSQRTPD, VSQRTPS, VSQRTSD, VSQRTSS, VSTMXCSR, VSUBPD, VSUBPS, VSUBSD, VSUBSS, VTESTPD, VTESTPS, VUCOMISD, VUCOMISS, VUNPCKHPD, VUNPCKHPS, VUNPCKLPD, VUNPCKLPS, VXORPD, VXORPS, VZEROALL, VZEROUPPER, WAIT, WBINVD, WRFSBASE, WRGSBASE, WRMSR, XABORT, XACQUIRE, XADD, XBEGINBRANCH, XBEGINN16, XBEGINN32, XCHG, XEND, XGETBV, XLATB, XOR, XORPD, XORPS, XRELEASE, XRSTOR, XRSTOR64, XRSTORS, XRSTORS64, XSAVE, XSAVE64, XSAVEC, XSAVEC64, XSAVEOPT, XSAVEOPT64, XSAVES, XSAVES64, XSETBV, XTEST, CODE OCODE END-CODE BEGIN, UNTIL, IF, ELSE, THEN, WHILE, REPEAT, LOOPNOTDONEWHILE, NZORLOOPNOTDONEWHILE, NEORLOOPNOTDONEWHILE, ZSORLOOPNOTDONEWHILE, EQORLOOPNOTDONEWHILE, LOOPDONEUNTIL, ZSORLOOPDONEUNTIL, EQORLOOPDONEUNTIL, NEORLOOPDONEUNTIL, NZORLOOPDONEUNTIL, N [R] [R+N] [N] [MOD] [R+S*R+N] [SIB] [O] O IMP OIMPORTCODELINK FRAME-PARAM NO-FRAME-PARAM 8BIT 16BIT 32BIT 64BIT 80BIT IMMEDIATE X86-WORDLIST O->RMOV, [O]->RMOV, OPUSH, [O]PUSH, [O]POP, $>P$PUSH, N->EAX, C Functions dg_compileloopwhilecomma dg_checkifvaluefits dg_isrexbaseorindexreg dg_isrexsrcordestreg dg_isdefaultaddresssizeonetarget dg_compilemovntorax dg_compilemovntoeax dg_compilesubn8fromrsp dg_compileaddn8torsp dg_compilepushn64toret dg_compilesubntorsp dg_compileaddntorsp dg_compilemovntoreg dg_compilemovregtoreg dg_compilemovfregtofreg dg_compileaddregtoreg dg_compilemovbracketrsptoreg 
dg_compilemovbracketrsptofreg dg_compilemovbracketrspd8toreg dg_compilemovbracketrspd8tofreg dg_compilemovbracketrbpd8toreg dg_compilemovbracketrspd32toreg dg_compilemovbracketrspd32tofreg dg_compilemovbracketrbpd32toreg dg_compilemovbracketrspdtoreg dg_compilemovregtobracketrsp dg_compilemovfregtobracketrsp dg_compilemovregtobracketrspd8 dg_compilemovfregtobracketrspd8 dg_compilemovregtobracketrbpd8 dg_compilemovregtobracketrspd32 dg_compilemovregtobracketrspd dg_compilemovfregtobracketrspd32 dg_compilemovregtobracketrbpd32 dg_compilemovregtobracketrbpd dg_compilepushbracketrbpd8 dg_compiledatalink dg_compilejumptorax dg_compilecalltorax dg_compilereturn dg_compileclc dg_compilestc dg_compilecalloffset dg_compilejmpoffset dg_compilejmpbracketoffset dg_compileacopyofsscopyto dg_compilecopystonewstring/a> dg_compilecalloffsetinsamebuffer dg_compilecalladdress dg_compilecallfunctblfunction dg_forthdocompiletypecall dg_packmodrslashm dg_packsib dg_packrex dg_compilealignretstack dg_forthcompilealignretforn dg_compilepushpBHarrayheadtoret dg_compilecallcore dg_compilecallftcolon dg_compilepushdatastack dg_compilepushntodatastack dg_compilobtoptodatastack ENTER-DGLU-FRAME, ENTER-CALL-SUBS-FRAME, COMPILE-ENTER-FRAME EXIT-DGLU-FRAME, EXIT-CALL-SUBS-FRAME, EXIT-CALL-SUBS-FRAME-NO-RET, COMPILE-EXIT-FRAME dg_compilepusholderrorcounttoret dg_compilequeryerror dg_forthcompilealignretfornpf dg_showframe dg_compilepushregtoret dg_compilepopregfromret dg_compilebranch dg_resolvecompiledbranch dg_resolvecompiled8bitbranch dg_compilecompare dg_compilepushntoret dg_forththen dg_compilentoparameter dg_forthcompilesafecallbuffer dg_initjumpbuffer dg_initSibformatter dg_bumpdisplacementsizeifneeded dg_setmod dg_formatsib dg_formatmodrslashm dg_formatreg dg_calculatemodrslashm dg_calculatesib dg_getsizefromreg dg_checkbasereg dg_checkindexreg dg_pullmemusingsib dg_pullmemusingrslashm dg_getcallsubsframepreservedregoffset dg_pulloneaddressingmode dg_compiledisplacement dg_formatpsf 
dg_compilepsf dg_queryisrega dg_gettargettype dg_determine2targettype dg_compilerexnosizetargetreg dg_compilerexnotargetreg dg_compilerex dg_compilen8tom8 dg_compilen16tom16 dg_compilen32tom32 dg_compilen8tom32 dg_compilen8tom16 dg_compilentom dg_compileopcodeplusropstr dg_compilemem8 dg_compilereg8 dg_compilerega8 dg_compilen8 dg_compilen16 dg_compilen16signextended dg_compilen32 dg_compilen32signextended dg_compilemem16 dg_compilereg16 dg_compilerega16 dg_compilemem32 dg_compilereg32 dg_compilerega32 dg_compilentarget dg_compileregatarget dg_compileregtarget dg_compilememtarget dg_compileonetarget dg_fillonetargetmemonlyoptable dg_compilen8tor8 dg_compilen16tor16 dg_compilen32tor32 dg_compilentor dg_compilen8toa8 dg_compilen16toa16 dg_compilen32toa32 dg_compilentoa dg_compilertom dg_compiletwotargets dg_fill2targetmathoptbl dg_fill2targetadcxoptbl dg_fill2targetmovoptbl dg_fill2targettestoptbl dg_fill2targetmemonlyoptbl dg_fill2targetmembonlyoptbl dg_fill2targetmem32onlyoptbl dg_fill2targetm32bonlyoptbl dg_pullandcompiletwotargets dg_getshiftopcodetype dg_pullandcompileshiftop dg_compilebitoprofr dg_compilebitoprofm dg_compilebitopnofr dg_compilebitopnofm dg_compilebitop dg_compiledshiftoprtom dg_compiledshiftop FRAME-PARAMS< NO-FRAME-PARAMS< CALL-SUBS-FRAME-LOCALS,< DGLU-FORTH-FRAME-LOCALS,< (( )), REGS< CALL-SUBS-FRAME-FAST-LOCALS,< $-RMASK $SRC-RMASK $DEST-RMASK $CTR-RMASK MATH-HI-RMASK MUST-BE-PRESERVED-RMASK IMUST-BE-PRESERVED-RMASK FMUST-BE-PRESERVED-RMASK U-IMUST-BE-PRESERVED-RMASK U-FMUST-BE-PRESERVED-RMASK IPARAMS-RMASK FPARAMS-RMASK PARAMS-RMASK ALLOCATABLE-RMASK IALLOCATABLE-RMASK FALLOCATABLE-RMASK IPARAM0-RMASK IPARAM1-RMASK IPARAM2-RMASK IPARAM3-RMASK IPARAM4-RMASK IPARAM5-RMASK IPARAM6-RMASK IPARAM7-RMASK FPARAM0-RMASK FPARAM1-RMASK FPARAM2-RMASK FPARAM3-RMASK FPARAM4-RMASK FPARAM5-RMASK FPARAM6-RMASK FPARAM7-RMASK AVAILABLE-RMASK PRSDEPTH PPRESERVED-DEPTH PPRESERVED-RMASK PUSED-RMASK R>RMASKPOS RMASKPOS>R RMASK-NO-FRAME-PRESERVE, 
RMASK-NO-FRAME-UNPRESERVE, RMASK-CALL-SUBS-FRAME-PRESERVE, RMASK-CALL-SUBS-FRAME-UNPRESERVE, UI-UF-MAKE-SURE-AVAILABLE-RMASK RMASK-USE RMASK-UNUSE R-USE UNUSED-IR-USE UNUSED-FR-USE IPARAM0 IPARAM1 IPARAM2 IPARAM3 SHADOWSIZE

Using The Assembler (4/11/2022)

These instructions are for 64 bit Mac OS X. This version of the assembler has almost all of the x86 instructions. Not all instructions are supported on every processor. Which instructions work on which processor is not documented here yet.

These examples use the Diaperglu script engine. You will need to build Diaperglu if it isn't already built. If you need to build diaperglu do this:

  • Open a terminal.
  • Change (cd) to the /DiaperGlu64/MaxOsX12.6.3 subdirectory
  • Execute this command: sudo make

sudo is required for newer versions of Mac Os X because make builds and writes a .dylib file.

The assembler compiles instructions for an x86 processor in 64 bit mode by default. You can switch the assembler to 32 bit mode by doing this:

4 PX86ASMADDRESSSIZE !

Just be warned that I have not tested 32 bit mode since I switched over to 64 bit mode so some stuff probably will not work in 32 bit mode.


Basic form of assembler functions using the CODE and END-CODE method:

CODE myassemblerfunctionname
instructions
END-CODE

CODE makes a new word in the current new word wordlist whose compile type is "compile a subroutine call". This means that if you use the word myassemblerfunctionname in execute mode from the command line, Diaperglu will try to compile a subroutine call to the myassemblerfunctionname function instead of running it. In Diaperglu, the symbol information for words is kept separate from the compiled code; the current compile buffer only gets the new compiled code and nothing else.

Example of an assembler routine which passes a parameter in RAX. This does not conform to the C standard calling convention. You can realistically only call this from other assembler routines:

CODE incrementrax
RAX INC,
RET,
END-CODE


Example of an assembler routine which uses the C standard calling convention:

CODE addtwoparameters
RDI RAX -> MOV, // parameter0
RSI RAX -> ADD, // parameter1
RET,
END-CODE

Mac Os X, 64 bit parameter passing follows the System V ABI calling convention. In this calling convention, the first 6 integer parameters are passed in registers. (RDI RSI RDX RCX R8 R9). Integer parameters after the first 6 are passed on the return stack. Integer results that fit into 64 bits are returned in RAX. 128 bit integer results are returned in RDX (hi 64 bits) and RAX (lo 64 bits). The first 8 floating point parameters are passed in registers. (XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7) Floating point parameters after the first 8 are passed on the return stack after any integer parameters that are passed on the return stack. In C, the function prototype for the addtwoparameters function above is:
UINT64 addtwoparameters (UINT64 parameter0, UINT64 parameter1)

Registers rbp, rbx, r12, r13, r14, and r15 must be preserved.

Also, there is a 16 byte return stack pointer alignment requirement when calling certain subroutines like many Mac OS X library functions.

Diaperglu has a word called CALLPROC that lets you call functions which take integer parameters and conform to this calling convention. To use CALLPROC you push the parameters in reverse order onto the data stack, followed by the number of parameters, then the address of the function. After CALLPROC the data stack has the low 64 bits of the return value. Example of calling a C standard (System V ABI) function using Diaperglu script:

DECIMAL 27 39 2 ' addtwoparameters >BODY CALLPROC U.

You should see 66 printed out.

' finds the execute token of the next word in the input stream. >BODY gets the current data address of an execute token. Since it's an address from a buffer you are compiling to, it is only valid until the next time the buffer moves, which may happen the next time you compile something unless you made a buffer that is of fixed size and won't move. This means you have to use the address you get from >BODY right away if the address is in a buffer that can move.

Example of calling addtwoparameters from another CODE routine:

CODE calladdtwoparameters
39 N RSI MOV, // parameter1
27 N RDI MOV, // parameter0
addtwoparameters // compiles a pc relative call to addtwoparameters
RET,
END-CODE

And how to call this calladdtwoparameters:

0 ' calladdtwoparameters >BODY CALLPROC U.

You should see 66 printed out.

Note that addtwoparameters and calladdtwoparameters have to be in the same compile buffer because this is a pc relative call. This type of call will work even if the buffer moves to a different base address.

More information on the C standard calling convention can be found here: Parameter passing
and here: System V ABI

There are also commands to call other types of functions. If you want to call a function which passes both int and floating point parameters, and returns a floating point, use CALLDFPPROC. If you want to call a function which takes integer parameters and returns a UINT128, use CALLPROCRETUINT128.


If you would like to see examples of most of the x86 assembly compiling words being used, you can look at the test scripts in these files:

  • /MaxOsX12.6.3/samplescripts/testcodex86.dglu
  • /MaxOsX12.6.3/samplescripts/testcodex86b.dglu
  • /MaxOsX12.6.3/samplescripts/testcodex86vex.dglu
  • /MaxOsX12.6.3/samplescripts/testcodex86frames.dglu

If you would like to run the test scripts, you will need a processor that can do all the latest extensions. Eventually I may sort the tests by processor technology. To run the test scripts, do this:

  • Open a terminal.
  • Change (cd) to the /DiaperGlu64/MaxOsX12.6.3/samplescripts subdirectory
  • Execute these commands:
  • ../diaperglu testcodex86.dglu
  • ../diaperglu testcodex86b.dglu
  • ../diaperglu testcodex86vex.dglu
  • ../diaperglu testcodex86frames.dglu

Compiling a call to an absolute address

It is also possible to do a call to an absolute address. (Absolute address is assembly language jargon for pointer.) Keep in mind that if you are using an address, it is only valid until the next time the buffer moves which may happen the next time you compile something, which could be THIS time you compile something. So realistically you can only use this technique to call functions in a buffer that is of fixed size and does not move. This also means you can not use this technique to compile code you are going to save to a file and load later without doing anything to fix the addresses. If you are compiling .o files, you also would have to fix the address after it loads. So this technique is probably only good for when you do run time dynamic linking using dlopen and dlsym. When you dlopen a library it is loaded into a buffer that won't move. And the addresses you get using dlsym won't change until the library is freed. With that in mind, if you want to call an absolute address you first have to move the address into a register, and then call the register, like this:

HEX 1000 1000 NEWBUFFER CONSTANT myfixedaddressbufferid // making a new fixed size buffer

myfixedaddressbufferid PCURRENTCOMPILEBUFFER ! // making it the current compile buffer

CODE addtwoparametersb
RDI RAX -> MOV, // parameter0
RSI RAX -> ADD, // parameter1
RET,
END-CODE

CODE calladdtwoparametersbabsolute
39 N RSI MOV, // parameter1
27 N RDI MOV, // parameter0
' addtwoparametersb >BODY N RAX MOV, // gets the address of addtwoparametersb into RAX
RAX CALL, // compiles a call to RAX
RET,
END-CODE

0 ' calladdtwoparametersbabsolute >BODY CALLPROC U.

Shared libraries and dlls don't change address once you load them, so it's possible to use this technique to compile assembly calls to shared library functions. For example, on Mac OS X:

$" libsystem.dylib" OPENLIBRARY$ CONSTANT mylibsystemhandle DROP

$" write" mylibsystemhandle FINDLIBRARYSYMBOL CONSTANT pwrite

PCURRENTCOMPILEBUFFER @ GETSBUFFER + CONSTANT pmymessage

$" hello world!" LENGTH$ CONSTANT mymessagelength
PCURRENTCOMPILEBUFFER @ $>BUF

CODE callsystemwrite
RBP PUSH,
RSP RBP MOV,
HEX -10 N RSP AND, // align the return stack for 0 stack parameters
1 N RDI MOV, // handle to stdout
mymessagelength N RDX MOV,
pmymessage N RSI MOV,
pwrite N RAX MOV,
RAX CALL,
RBP RSP MOV,
RBP POP,
RET,
END-CODE

0 ' callsystemwrite >BODY CALLPROC DROP

Since Diaperglu comes as a shared library, you can use this technique to call Diaperglu's C functions from assembly, with the same caveat as before: you can't save the compiled code to a file and run it later without fixing the addresses.

Making Diaperglu script words using the assembler.

This function uses the Diaperglu standard frame and pBHarrayhead parameter.

CODE +U.
ENTER-DGLU-FRAME,
] + U. [
EXIT-DGLU-FRAME,
END-CODE

ENTER-DGLU-FRAME, and EXIT-DGLU-FRAME, are required because they keep track of the pBHarrayhead pointer to the Diaperglu instance and the current error count among other things. The right bracket tells Diaperglu to go into compile script mode. Plus and Udot are two Diaperglu script commands. The left bracket puts Diaperglu back into execute script mode.

If you try to execute the new +U. command right now, it will compile a call to +U. which is probably not what you want. To change the compile type of a Diaperglu script word compiled with assembly so you can use it from the command line easily, do this:

OCREXECUTE ' +U. SETOCR

Calling +U. from a Diaperglu script file is no problem. Do this:

+U.

But there is a problem. The calls to + and U. are not pc relative. This means you can not save the +U. code to a file and load it later because the addresses of + and U. will likely change after you exit Diaperglu and restart. Even if the new Mac Os thing about apps always starting at address 0 is true, there is always the possibility you may be running your code on a newer version of Diaperglu, so it's not a good idea to use absolute addresses unless you know they aren't going to change. If you want to compile assembly code for shared libraries and executables that calls Diaperglu functions, you will need to import the functions with the Diaperglu linker, or import the functions using ld. How to do imports, exports, and linking is shown in the examples. Keep in mind you have to follow the System V ABI calling convention requirements, including the one for aligning the return stack to a 16 byte boundary when you call operating system functions or gcc compiled code, like Diaperglu C functions. The Diaperglu script compiling mode shown in +U. does align the stack for the calls to U. and +. The examples in the Using The Assembler section do not because they don't have any instructions in them that require alignment.

Making a 64 bit exe hello world on Mac OS X

Building the example:

  • Open a terminal.
  • Use cd to change to the /MaxOsX12.6.3/samplescripts/helloworld64exe subdirectory
  • Execute this command: sudo make

This compiles an exe called helloworld64, which you can then run like this: ./helloworld64

The makehelloworld64doto.dglu script file makes a .o file which imports the system write and system exit functions, and exports one symbol. ld uses the exported symbol as the entry point (where to start) symbol. I used the system write and exit functions instead of syscall because syscall is now deprecated on newer versions of Mac Os. What I think that means is they took support for the syscall instruction out completely and had the gas assembler compile imports to a function named syscall instead of compiling the syscall instruction.

If you want to distribute your new executable on Mac, you will need to be aware of Mac's Gatekeeper software. Here is a developer thread which explains things: https://developer.apple.com/forums/thread/706442. Basically, unless you want to put your app on the app store, or have it notarized, your end users will need to rebuild your app from source on their local machine. 2023 March 20 J.N.

Making a .o file

If you want to see an example of making a .o file do this:

  • Open a terminal. On 64 bit Windows you need to use the Visual Studio 64 bit x86 native tools Console.
  • On Mac OS X, use cd to change to the /MaxOsX12.6.3/samplescripts/testmakedoto subdirectory
  • On 64 bit Windows, use cd to change to the /Win64MSVC2019/samplescripts/testmakedoto subdirectory
  • On Mac OS X, execute this command: sudo make
  • On 64 bit Windows, execute this command: nmake

The Diaperglu word that makes .o files also uses the same symbol lists and raw code buffers the Diaperglu linker does. This example builds a simple two function export symbol list and raw code buffer, makes a .o file, then calls ld to link it into a .dylib. It also builds a c program to call the .dylib functions.

This is an export only example. If you would like to see an example that uses ld to have one .o file import from another, do this:

  • Open a terminal. On 64 bit Windows you need to use the Visual Studio 64 bit x86 native tools Console.
  • On Mac OS X, use cd to change to the /MaxOsX12.6.3/samplescripts/asmimportusingld subdirectory
  • On 64 bit Windows, use cd to change to the /Win64MSVC2019/samplescripts/asmimportusingld subdirectory
  • On Mac OS X, execute this command: sudo make
  • On 64 bit Windows, execute this command: nmake

If you want to see an example of making .o files on Mac OS X that uses words to automate the process do this:

  • Open a terminal.
  • Use cd to change to the /MaxOsX12.6.3/samplescripts/testosymbolbuf subdirectory
  • Execute this command: sudo make

Making a .dylib

If you want to see an example of making a .dylib file on Mac OS X do this:

  • Open a terminal.
  • Use cd to change to the /MaxOsX12.6.3/samplescripts/testmakedylib subdirectory
  • Execute this command: sudo make

The word that makes .o files also uses the same symbol lists and raw code buffers the Diaperglu linker does. This example builds a simple two function symbol list and raw code buffer, makes a .o file, then calls ld to link it into a .dylib. It also builds a c program to call the .dylib functions.

On Mac, and probably Windows too, the memory in the dylib is read only at run time. This means you can't put any variables in the code buffer. You don't want to do that anyways because it makes your code non-reentrant. You can put read only constants in there if you want which you can access using the [O] or RIP nn [R+N] addressing modes.

Making .o files using the osymbol helper words

Added: April 10, 2022 J.N.

If you want to see an example of making .o files on Mac OS X that uses words to automate the process do this:

  • Open a terminal.
  • Use cd to change to the /MaxOsX12.6.3/samplescripts/testosymbolbuf subdirectory
  • Execute this command: sudo make

This example makes two .o files. The first .o file is linked into an export-only dylib. The second .o file is linked into an export dylib which imports the first dylib. This example also uses a number of helper words to make importing functions, exporting functions, using input parameters, using local variables, and calling functions easier.

These helper functions let you make .o files that import non hierarchical functions. This means you can't tell the linker which shared library or .o file the imported symbol comes from. This means each imported symbol name has to be unique across all shared libraries and .o files in your project.

The flat .o buf helper functions set up and use: one compile buffer, an export symbol list, an import symbol list, and a hierarchical list that holds the symbol lists. The helper functions also keep track of most of the information needed to clean up when you are done. The old current compile buffer is held on the r stack, and the parent element id and hlist id of the symbol lists are held on the eh stack.

These are the helper functions:

  • NEW-FLAT-OSYMBOL-BUF sets up compiling a new .o buffer with new export and import symbol lists using osymbols

  • FREE-FLAT-OSYMBOL-BUF cleans up from compiling a .o buffer using osymbols

  • ENTER-CALL-SUBS-FRAME, compiles a subroutine entry that uses a frame that supports locals and calling functions

  • EXIT-CALL-SUBS-FRAME, compiles exiting the call subs frame

  • ENTER-DGLU-FRAME, compiles a subroutine entry that uses the DiaperGlu Forth frame

  • EXIT-DGLU-FRAME, compiles exiting the DiaperGlu Forth frame

  • ENTER-FRAME, compiles a subroutine entry that uses just the frame register

  • EXIT-FRAME, compiles exiting a frame that used just the frame register

  • FRAME-PARAMS< creates temporary operating system independent words to use in place of the addressing mode targets needed to access parameters passed to a subroutine using one of the frames

  • NO-FRAME-PARAMS< creates temporary operating system independent words to use in place of the addressing mode targets needed to access parameters passed to a subroutine that is not using a frame

  • CALL-SUBS-FRAME-LOCALS,< creates temporary operating system independent words and compiles code to allocate storage for a set of local variables for a subroutine using the call subs frame

  • DGLU-FORTH-FRAME-LOCALS,< creates temporary operating system independent words and compiles code to allocate storage for a set of local variables for a subroutine using the DiaperGlu Forth frame

  • RMASK-CALL-SUBS-FRAME-PRESERVE, compiles code to save a list of registers, which can include parameters in registers, to the local return stack frame

  • RMASK-NO-FRAME-PRESERVE, compiles code to save a list of registers, which can include parameters in registers, to the return stack

  • RMASK-CALL-SUBS-FRAME-UNPRESERVE, compiles code to restore a list of registers from the local return stack frame which were previously preserved

  • RMASK-NO-FRAME-UNPRESERVE, compiles code to restore a list of registers from the return stack which were previously preserved

  • UI-UF-MAKE-SURE-AVAILABLE-RMASK generates a list of registers to preserve in order to make the desired number of registers available

  • RMASK-USE marks a list of currently unused registers as used

  • R-USE marks a currently unused register as used

  • REGS< assigns currently unused registers to symbol names, adds them to the locals wordlist, and marks them as used

  • UNUSED-FR-USE gets the next unused floating point register and marks it as used

  • UNUSED-IR-USE gets the next unused integer register and marks it as used

  • R>RMASKPOS gets the 0 based bit index of a register in an rmask

  • RMASKPOS>R converts the 0 based bit index in an rmask to a register

  • R>RMASK converts a register to an rmask representing that register

  • $-RMASK gets the rmask representing the registers used in x86 string operations which are RCX RDI and RSI

  • $SRC-RMASK gets the rmask representing the register used as the source in x86 string operations which is RSI

  • $DEST-RMASK gets the rmask representing the register used as the destination in x86 string operations which is RDI

  • $CTR-RMASK gets the rmask representing the register used as the counter in x86 string operations which is RCX

  • MATH-HI-RMASK gets the rmask representing the register used as the high result in x86 math operations like MUL, which is RDX

  • MUST-BE-PRESERVED-RMASK gets the rmask representing the list of registers which must be preserved across a subroutine call

  • IMUST-BE-PRESERVED-RMASK gets the rmask representing the list of integer registers which must be preserved across a subroutine call

  • FMUST-BE-PRESERVED-RMASK gets the rmask representing the list of floating point registers which must be preserved across a subroutine call

  • U-IMUST-BE-PRESERVED-RMASK gets an rmask representing u integer registers from the list of registers which must be preserved across a subroutine call

  • U-FMUST-BE-PRESERVED-RMASK gets an rmask representing u floating point registers from the list of registers which must be preserved across a subroutine call

  • IPARAMS-RMASK gets an rmask representing the list of registers which can be used as integer parameters

  • FPARAMS-RMASK gets an rmask representing the list of registers which can be used as floating point parameters

  • IPARAM0-RMASK gets an rmask representing the register which is used as the first integer parameter

  • IPARAM1-RMASK gets an rmask representing the register which is used as the second integer parameter

  • IPARAM2-RMASK gets an rmask representing the register which is used as the third integer parameter

  • IPARAM3-RMASK gets an rmask representing the register which is used as the fourth integer parameter

  • IPARAM4-RMASK gets an rmask representing the register which is used as the fifth integer parameter

  • IPARAM5-RMASK gets an rmask representing the register which is used as the sixth integer parameter

  • IPARAM6-RMASK gets an rmask representing the register which is used as the seventh integer parameter

  • IPARAM7-RMASK gets an rmask representing the register which is used as the eighth integer parameter

  • FPARAM0-RMASK gets an rmask representing the register which is used as the first floating point parameter

  • FPARAM1-RMASK gets an rmask representing the register which is used as the second floating point parameter

  • FPARAM2-RMASK gets an rmask representing the register which is used as the third floating point parameter

  • FPARAM3-RMASK gets an rmask representing the register which is used as the fourth floating point parameter

  • FPARAM4-RMASK gets an rmask representing the register which is used as the fifth floating point parameter

  • FPARAM5-RMASK gets an rmask representing the register which is used as the sixth floating point parameter

  • FPARAM6-RMASK gets an rmask representing the register which is used as the seventh floating point parameter

  • FPARAM7-RMASK gets an rmask representing the register which is used as the eighth floating point parameter

  • PPRESERVED-DEPTH gets a pointer to the variable used to remember the return stack depth of registers preserved in a subroutine. The pointer is only valid until the buffer holding the variable is grown.

  • PUSED-RMASK gets a pointer to the variable used to keep track of registers not available for use. The pointer is only valid until the buffer holding the variable is grown.

  • OSYMBOL adds an export symbol to the export symbol list for the .o buffer's current compile offset

  • OSYMBOL-IMPORT compiles code for an import link if the os needs it and adds an import symbol to the import symbol list for the .o buffer's current compile offset

  • OSYMBOL-CODE-IMPORTS,< compiles a set of links for imported subroutines and adds them to the import symbol list

  • EH. finds the wordname after EH. in the export symbol list and gets the offset it represents

  • EH[1D]. finds the wordname after EH[1D]. in the import symbol list and gets the offset it represents

  • IMP operating system independent addressing mode specifier for a CALL, with an OIMPORTCODELINK target

  • OIMPORTCODELINK adjusts the offset on top of the data stack if needed for the os then pushes a constant representing the addressing mode of an import link for the os

  • BUF>NEW.OBUF takes the raw .o buffer being compiled and the export symbol list and uses it to build the operating system dependent file image in a new buffer. This function does not support imports.

  • BUF>.OFILE$ takes the raw .o buffer being compiled and the export symbol list and uses it to build the operating system dependent file image and save that image to a file. This function does not support imports.

  • BUF>NEWEXPORTIMPORT.OBUF takes the raw .o buffer being compiled, the export symbol list, and the import symbol list and uses it to build the operating system dependent file image in a new buffer.

  • (( begins a list of addressing mode targets for setting up parameters for a subroutine call, or for getting the parameters passed out from a subroutine call

  • )), uses a list of addressing mode targets to compile setting up parameters for a subroutine call or compile getting the parameters passed out from a subroutine call

  • Using this system, you first do NEW-FLAT-OSYMBOL-BUF to set things up.

    Then you need to do X86-WORDLIST>SEARCH-ORDER. This step is not automated so that in the future, when there is more than one assembler to pick from, you can still choose.

    Here is an example of a simple exported subroutine that does not use a frame:

    OSYMBOL dg_firstfunction
      1122334455667788 N RAX MOV,
      RET,
    

    Here is an example of an exported subroutine using named parameters:

    OSYMBOL dg_thirdfunction
      NO-FRAME-PARAMS< INT x INT y >
      x RAX MOV, y RAX ADD,
      RET,
    
      ?CLEAR-LOCALS
    

    Here is an example of using the automated parameter setup stuff to call a function:

    OSYMBOL dg_ninthfunction
      ENTER-CALL-SUBS-FRAME,
      FRAME-PARAMS< INT w INT x INT y INT z >
      (( y >IPARAM z >IPARAM )), EH. dg_thirdfunction O CALL,
      EXIT-CALL-SUBS-FRAME,
    
      ?CLEAR-LOCALS
    

    Here is an example of importing a function in a way that works on both Mac for x86 and Windows:

    OSYMBOL dg_testsecondimportagain
      IMP CALL, OSYMBOL-IMPORT dg_secondfunction
      RET,
    
      ?CLEAR-LOCALS
    

    Note: on Mac you can use the IMP OSYMBOL-IMPORT words with any instruction that supports bracket [O]

    Also, if you want to convert a buffer offset to a pointer, like if you want to test the function with CALLPROC, you can use O>P for example: EH. myexportedsymbolname PCURRENTCOMPILEBUFFER @ O>P

Attempting to Make Platform Independent Assembly Source Code new in v5.8

It may be possible, for some functions, to write assembly source that is nearly platform independent using the new helper words in v5.8, at least between Mac and Windows. Where you run into trouble is when you are passing parameters to functions or calling functions that have mixed floating point and integer parameter lists. On Windows, the first four parameters are passed in registers regardless of type. On Mac, the first 6 integer parameters are passed in registers, and the first 8 floating point parameters are passed in registers. If you are making your own new functions, and you put the floating point parameters after the integer parameters in the function prototype, you will probably be ok. The other issue is the x86 uses specific registers for certain instructions, like MUL, and MOVS, which are also used to pass parameters. You could either avoid using those instructions, or mark those registers as used manually... Marking them as used would at least get you portability between x86 Windows and Mac, but not other platforms. I put in generic RMASK words for those registers to help, but a truly cross platform solution will require more. There are a couple of other issues to worry about, like the direction flag if you are using it... Leaving the direction flag alone would work. If you have to use the direction flag, putting it back the way it was when you are done should also work. On both Mac and Windows, registers R10 and R11, along with any registers you did not use for parameters are available for use as local variables. This should mean you have at least two plus up to four more from unused parameters without preserving any. Future platforms may not have this. I plan on making a FAST-LOCALS< word which will use available registers first then allocate storage on the return stack so you don't have to worry about counting. But for these examples I'm going to assume there are two available for use with REGS<

Here is an example of writing an exported no frame function in a way that works on both Mac for x86 and Windows:

OSYMBOL dg_addtwointparams
  NO-FRAME-PARAMS< param0 param1 >
  CRETURNS< ret0 > 
  param0  ret0  MOV,
  param1  ret0  ADD,
  RET,

  ?CLEAR-LOCALS

Here is an example of writing an exported call subs frame function in a way that works on both Mac for x86 and Windows:

OSYMBOL dg_addtwointparamswithframe
  ENTER-CALL-SUBS-FRAME,
  FRAME-PARAMS< param0 param1 >
  CRETURNS< ret0 >
  param0  ret0  MOV,
  param1  ret0  ADD,
  EXIT-CALL-SUBS-FRAME,

  ?CLEAR-LOCALS

Here is an example of writing a sort of portable exported no frame function that puts a local variable in a register:

OSYMBOL dg_addtwointparamswithlocalreg
  NO-FRAME-PARAMS< param0 param1 >
  CRETURNS< ret0 >
  REGS< reg0 >
  param0  reg0  MOV,
  param1  reg0  ADD,
  reg0  ret0  MOV,
  RET,

  ?CLEAR-LOCALS

Here is an example of writing a sort of portable exported call subs frame function that puts a local variable in a register:

OSYMBOL dg_addtwointparamswithframeandlocalreg
  ENTER-CALL-SUBS-FRAME,
  FRAME-PARAMS< param0 param1 >
  CRETURNS< ret0 >
  REGS< reg0 >
  param0  reg0  MOV,
  param1  reg0  ADD,
  reg0  ret0  MOV,
  EXIT-CALL-SUBS-FRAME,

  ?CLEAR-LOCALS

Here is an example of writing a sort of portable exported no frame function that preserves parameter regs:

OSYMBOL dg_addtwointparamswithpreservedparams
  NO-FRAME-PARAMS< param0 param1 >
  CRETURNS< ret0 >
  IPARAM0-RMASK IPARAM1-RMASK | RMASK-NO-FRAME-PRESERVE, 
    // copies param0 and param1 to the return stack if they are regs
    // future references of param0 and param1 will refer to the copies 
    //  on the return stack
  REGS< reg0 >
  param0  reg0  MOV,
  param1  reg0  ADD,
  reg0  ret0  MOV,
  // don't need to restore parameter regs
  IPARAM0-RMASK IPARAM1-RMASK | COUNTBITS RETDROP,
  RET,

  ?CLEAR-LOCALS

Here is an example of writing a sort of portable exported no frame function that preserves regs that must be preserved:

OSYMBOL dg_addtwointparamswithpreservedmustbepreserved
  NO-FRAME-PARAMS< param0 param1 >
  CRETURNS< ret0 >
  3 U-IMUST-BE-PRESERVED-RMASK RMASK-NO-FRAME-PRESERVE, 
  REGS< reg0 >
  param0  reg0  MOV,
  param1  reg0  ADD,
  reg0  ret0  MOV,
  3 U-IMUST-BE-PRESERVED-RMASK RMASK-NO-FRAME-UNPRESERVE, 
    // does not adjust the return stack, just copies the regs
  3 RETDROP, 
    // will probably be 3 dropped, but if you try to preserve 
    //  more than there are, it could be less
  RET,

  ?CLEAR-LOCALS

Here is an example of writing a sort of portable exported call subs frame function that preserves parameter regs:

OSYMBOL dg_addtwointparamswithpreservedparameters
  ENTER-CALL-SUBS-FRAME,
  FRAME-PARAMS< param0 param1 >
  CRETURNS< ret0 >
  IPARAM0-RMASK IPARAM1-RMASK | RMASK-CALL-SUBS-FRAME-PRESERVE, 
    // copies param0 and param1 to local return stack frame 
    //  variables if they are regs
    // future references of param0 and param1 will refer to 
    //  the copies on the return stack frame
  REGS< reg0 >
  param0  reg0  MOV,
  param1  reg0  ADD,
  reg0  ret0  MOV,
  // don't need to restore parameter regs
  EXIT-CALL-SUBS-FRAME, 
    // this includes the return stack drop automatically

  ?CLEAR-LOCALS

Here is an example of writing a sort of portable exported call subs frame function that preserves regs that must be preserved:

OSYMBOL dg_addtwointparamswithpreservedmustbepreserved
  ENTER-CALL-SUBS-FRAME,
  FRAME-PARAMS< param0 param1 >
  CRETURNS< ret0 >
  3 U-IMUST-BE-PRESERVED-RMASK RMASK-CALL-SUBS-FRAME-PRESERVE, 
    // copies param0 and param1 to local return stack frame 
    //  variables if they are regs
    // future references of param0 and param1 will refer to 
    //  the copies on the return stack frame
  REGS< reg0 >
  param0  reg0  MOV,
  param1  reg0  ADD,
  reg0  ret0  MOV,
  3 U-IMUST-BE-PRESERVED-RMASK RMASK-CALL-SUBS-FRAME-UNPRESERVE, 
    // does not adjust the local return stack frame, just copies the regs
  EXIT-CALL-SUBS-FRAME, 
    // this includes the return stack drop automatically

  ?CLEAR-LOCALS

Using the Diaperglu linker

Diaperglu has its own linker. The linker uses link description files to link to raw code files. If you would like to see a simple example on Mac OS X, do this:

  • Open a terminal.
  • Use cd to change to the /MacOsX12.6.3/samplescripts subdirectory
  • Execute this command: ../diaperglu testglulinking.dglu

If you would like to see a more complicated example, do this:

  • Open a terminal.
  • Use cd to change to the /MacOsX12.6.3/samplescripts subdirectory
  • Execute this command: ../diaperglu lifedglib.dglu

This builds two files: life.dglib is the raw compiled code file and life.glulist is the linker description file.

Then execute this command to test the compiled files: ../diaperglu testlifedglib.dglu

The linker uses symbol lists instead of word lists to hold the offsets of your subroutines. This means the linker can't find the offset of your routines if you use CODE. In the example you will see the X86-WORDLIST pushed to the search order manually and how to make a symbol list. The symbol list is simply a hierarchical list of name value pairs, where the name is your symbol's name, and the value is the offset in the current compile buffer.

Diaperglu Life example on Mac OS X

You can also look at the DiaperGluLifeApplication. (4/5/20 J.N.)

This application uses Diaperglu to link to the life.dglib file using the life.dglulist file.

The DiaperGluLifeApplication is a standard xcode project and is in the /MacOsX12.6.3/samplescripts/DiaperGluLifeApplication subdirectory.

If you want to run this demo on your system, and I apologize, you have to edit the DiaperGluLifeApplication.m file and put in the local path to where you installed libdiaperglu.dylib. This is because new Mac OS X security precautions prevent accessing shared libraries with relative path names unless that shared library is in the same subdirectory. Since I only want one copy of the diaperglu shared library in a release, you have to edit the path name.

Syntax:

This assembler follows the Forth convention where sources come first and destinations are later. For most two target instructions you can override this with the reverse operator <- which goes right after either the source or destination target.

For three target instructions, some of them require that the immediate target be the first target pushed onto the data stack. Some words can figure it out... Eventually I'll probably change it so that the compiler will figure out which one is the immediate target and not care where it is. Some instructions already do this.

The last thing in compiling an instruction is the compiling word itself. For example:

EAX EDX MOV, // this instruction compiles a move from EAX to EDX.

Some x86 instructions have the same name but a different number of parameters such as IMUL, or have the same name but compile completely different things like MOVQ. In order to keep it simple, I made different names for these situations like MOVQ2,. Otherwise you would have to push something on the stack for every single compiling word so that the compiling words could figure it out. I think adding a letter or two to the compiling word name is much better. For MOVQ I made MOVQ, the word that moves stuff from XMMn or STn registers to/from memory or XMMn or STn registers. And MOVQ2, is the word that moves stuff from XMMn registers to/from regular integer registers and memory.

// ///////////////////////////////////////////////////////////////////////////////////
//
// 64 bit mode addressing mode examples:
//
//  95 N              // an immediate value of 95, don't care about size encoding.
//                    //  except for the MOV, instruction with a register
//                    //  destination, the immediate size is limited to 32 bits.
//
//  95 4 IMMEDIATE    // an immediate value of 95, minimum encoding size is 4 bytes.
//                    //  not all instructions support all immediate size encodings.
//                    //  the immediate value is limited to 32 bits.
//
//  EAX               // a register target, in this case EAX
//
//  EAX R             // a register target, in this case EAX, some instructions
//                    //  have a shortcut way to encode register AL, AX, EAX, RAX
//                    //  targets. Some compiling words will not use the shortcut
//                    //  encoding for those instructions if you use R
//                    //  (I think you can use 0-15 instead with R)
//
//  RAX [R]           // a memory target, the address is in the RAX register
//                    //  and you didn't specify the size. If the compiler can
//                    //  figure out the size from the other target, it's not a
//                    //  problem.
//
//  RAX [R] 32BIT     // a 4 byte memory target. RAX holds the address.
//
//  RAX 92 [R+N]      // a memory target. Address is at RAX + 92.
//                    //  Data size not specified.
//                    //  Use smallest possible displacement size.
//
//  RAX SCALE2* RDX 92 [R+S*R+N]
//                    // a memory target. Address is at RAX + 2*RDX + 92.
//                    // Data size not specified.
//                    // Use smallest possible displacement size.
//
//  RAX SCALE2* YMM0 92 [R+S*YMMR+N]
//                    // multiple memory targets calculated using sections
//                    //  of the YMM0 register as multiple indexes.
//                    // Addresses are at RAX + 2*(YMM0[i]) + 92.
//                    // Data size not specified.
//                    // Use smallest possible displacement size.
//                    // This addressing mode only used with special instructions
//                    //  like the VGATHERDPD instruction.
//
//  32 RIP [R+N]      // a memory target. Address is pc relative, and is 32 bytes
//                    //  from the byte after instruction being compiled. (The byte
//                    //  after the instruction is offset 0). Data size is not
//                    //  specified. Use smallest possible displacement size.
//
//  32 [RIP+N]        // a memory target. Address is pc relative, and is 32 bytes
//                    //  from the byte after instruction being compiled. (The byte
//                    //  after the instruction is offset 0). Data size is not
//                    //  specified. Use smallest possible displacement size.
//
//  32 [O]            // a memory target. Address is pc relative and is 32
//                    //  bytes from the beginning of the buffer you are currently
//                    //  compiling in.
//
//  HEX C001C4 [N]    // a memory target. Address is C001C4. Unfortunately,
//                    //  addresses are limited to 32 bits with [N]. This isn't
//                    //  very useful in 64 bit mode... unless... Well there's
//                    //  this thing where Mac OS X applications think they
//                    //  are loaded at address 0... I recommend not using
//                    //  this mode and just using [RIP+N] or [O]. But if you really
//                    //  want to use 64 bit absolute addresses, you can use
//                    //  the MOV, instruction with a register to move a 64
//                    //  bit address to the register and then use [R].
//                    //  There are also a set of special move instructions
//                    //  which move stuff to and from the al/ax/eax/rax
//                    //  register and 64 bit memory addresses. One is named
//                    //  MOV[N]->AL,
//
//   RAX 92 4 [MOD]   // a memory target. Address is at RAX + 92.
//                    // Data size is not specified.
//                    // Use at least 4 bytes to encode the displacement.
//
//   RAX SCALE2* RDX 92 2 [SIB]
//                    // a memory target.
//                    // Address is at RAX + 2*RDX + 92.
//                    // Data size not specified.
//                    // Use at least 2 bytes to encode the displacement.
//
//  RAX SCALE2* YMM0 92 2 [VSIB]
//                    // multiple memory targets calculated using sections
//                    //  of the YMM0 register as multiple indexes.
//                    // Addresses are at RAX + 2*(YMM0[i]) + 92.
//                    // Data size not specified.
//                    // Use at least 2 bytes to encode the displacement.
//                    // This addressing mode only used with special instructions
//                    //  like the VGATHERDPD instruction.
//
//  XMM0              // an xmm register target, in this case XMM0.
//
//  XMM0 XMMR         // an xmm register target, in this case XMM0.
//                    //  (I think you can use 0-15 instead with XMMR.)
//
//  ST0               // a floating point register target, in this case ST0.
//
//  ST0 FPSR          // a floating point register, in this case ST0.
//                    //  (I think you can use 0-7 instead with FPSR.)
//
//  5 FRAME-PARAM     // an xmm register, 64 bit integer register, or
//                    //  RBP nn [R+N] where nn is the offset to the parameter
//                    //  FRAME-PARAM assumes some variables have been set.
//                    //  These variables are set automatically when you use
//                    //  FRAME-PARAMS,< 
//
//  5 NO-FRAME-PARAM  // an xmm register, 64 bit integer register, or
//                    //  RSP nn [R+N] where nn is the offset to the parameter
//                    //  NO-FRAME-PARAM assumes some variables have been set.
//                    //  These variables are set automatically when you use
//                    //  NO-FRAME-PARAMS,<
//                    //  NO-FRAME-PARAMS also needs PRSDEPTH set to the number
//                    //   of items on the return stack. NO-FRAME-PARAMS,<
//                    //   sets this to the default number of items on the return
//                    //   stack at the entry of a function for the operating
//                    //   system. You need to add 1 to PRSDEPTH for each 64 bit
//                    //   item you push onto the return stack for NO-FRAME-PARAM
//                    //   to work correctly.
//
// ///////////////////////////////////////////////////////////////////////////////////

Mac OS X 64 bit calling convention:

You can pass parameters to your assembly language subroutines however you want. But if you want to be able to call them from C, or if you want to call C functions from your subroutines, you need to know this:

Mac OS X follows the System V AMD64 ABI calling convention for 64 bit x86.

Parameters are passed like this: (XMM1:XMM0 RDX:RAX) = f(RDI RSI RDX RCX R8 R9 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 RSP[1] RSP[2] RSP[3] ... RSP[N])

What this means is, the first 6 integer parameters are passed in registers RDI RSI RDX RCX R8 R9 where RDI is the first integer parameter. The first 8 floating point values are passed in registers XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 where XMM0 is the first floating point parameter. Anything that doesn't fit in registers is passed in on the return stack. Any extra integer parameters are pushed onto the return stack after any extra floating point parameters. (This means the integer parameters on the stack are at a lower address than the floating point ones. The first integer parameter on the stack is at the lowest address.) Also, the return stack grows down from higher addresses to lower addresses. Note that for things that don't fit into one register, like 128 bit integers, the whole thing has to fit into registers or it is passed on the return stack.

Integer results are returned in registers RDX:RAX where RAX is the low 64 bits and RDX is the high 64 bits. (If you are only returning a 64 bit result then only RAX is used. If you are returning 128 bits, then both are used.)

Floating point results are returned in registers XMM1 and XMM0 where XMM1 is the high floating point return and XMM0 is the low floating point return. I'm guessing most of the time you will only use XMM0 because a double floating point number is only 64 bits and XMM0 can hold 2 of them.

The return stack must be aligned to a 16 byte boundary when you call a function. If you are calling your own subroutines and you know they don't use any instructions that will require alignment, you may be able to ignore this one. If you are calling any operating system functions, it's a really good idea to align the stack. When your subroutine gets called, what this means is the return address is on top of the stack, and the first return stack parameter, if there is one, is 16 byte aligned. Since most of the time all data will be passed in registers this means you will have to push a fake parameter onto the stack. The exception to this is at program entry. The return stack is 16 byte aligned at program entry, so you will probably not need to push a fake parameter for subroutines you call at program entry. If you want to be sure of the alignment under any circumstances, you can do what I do and AND the RSP with -0x10 to force 16 byte alignment. If you use this technique in a subroutine, you will need to recover the return stack pointer before returning, but that's one of the things the RBP frame register is for. In 64 bit mode each push to the return stack is 8 bytes.... 2 pushes = 16 bytes.

If you are passing stuff bigger than 128 bits on the return stack, you are supposed to align your stack to a larger boundary than 16 bytes. But it seems to me you would want the big thing aligned to the boundary so an instruction requiring alignment can use it... not just the whole parameter list.

There are other types too like long double, __m256, and __m512. It gets kind of complicated for these types, like some of these are passed using RDI as a pointer to memory on the return stack. And it seems like this doesn't happen all the time with these types. It may depend on what will fit into registers for some. Since I'm not too familiar with how to use these I'm just going to refer you to the System V ABI.

Redzone. In other calling conventions you are supposed to make sure stuff you use in your subroutine in the return stack area of memory is actually on the return stack. If an interrupt or task switch happens, then the interrupting routine is not supposed to change anything that is already on the stack. The System V ABI gives you a 128 byte area off the stack. If you trust interrupt handlers, you can use this area. In other words, you don't have to push your local variables onto the return stack up to 128 bytes worth of them. But then you can't call any subroutines that pass parameters on the stack or that have their own local variables without overwriting your local data... I would highly recommend you not use the red zone.

Preserved here means you must put it back the way you found it after you use it.

Registers that must be preserved: RBP RBX R12 R13 R14 R15

Direction flag must be set to forward (clear). What I do is push the flags on entry and pop the flags on exit. This puts it back the way I got it. This also makes code kinda more portable to other operating systems... I recommend you not assume the direction flag was left on forward. Let's say some other app didn't follow the rule, and the operating system didn't clear it for you during the task switch... it would be kinda hard to figure out what was going wrong.

You are supposed to leave the processor in floating point mode. So if you use MMX instructions, you are supposed to put the processor back into FP mode with an EMMS instruction. If you are going to do any floating point stuff, I wouldn't assume everyone followed this rule. The MMX instructions have ST0 ST1 ST2 ST3 ST4 ST5 ST6 and/or ST7 as one or both of the parameters.

The MXCSR register control bits must be preserved.

The MXCSR register status bits are not preserved.

The x87 status word is not preserved.

The x87 control word is preserved.

Note: System V ABI doc mentions an FEMMS instruction as an alternative to EMMS but Intel docs do not have FEMMS. Nor does Intel support the FEMMS instructions. AMD docs have FEMMS but recommend you do not use it. J.N. 4/6/2020

Windows 64 bit calling convention:

You can pass parameters to your assembly language subroutines however you want. But if you want to be able to call them from C, or if you want to call C functions from your subroutines, you need to know the MS x86-64 calling convention.

The first 4 arguments 64 bits or less are passed in registers, it doesn't matter if they are floating point or integer. Arguments after 4 are passed on the return stack

If all the arguments are 64 bit or less integers or pointers, the first 4 are passed in RCX RDX R8 R9

if all the arguments are 64 bit or less floating points, the first 4 are passed in: XMM0 XMM1 XMM2 XMM3

If the first 4 arguments are mixed floating point and integer/pointer 64 bits or less then, the register for that argument position and type is used, for example: if the first argument is integer and the second argument is floating point then rcx is used for the first argument and xmm1 is used for the second argument.

If any of the first 4 arguments are varargs and floating point, they are supposed to be passed in the integer register for that position. However, Windows recommends you pass these floating point varargs in both the integer and floating point registers just in case.

Arguments after 4 are passed on the return stack in the order they appear in the C prototype. In other words, argument 5 is the first argument after the 4 argument shadow region going up in memory, and argument 6 is the second argument after the 4 argument shadow region going up in memory. Four fake parameters are passed on the stack and they are intended to hold copies of RCX RDX R8 and R9, but you can use them as local storage for whatever you want if you like.

64 bit integers and less are returned in RAX.

Parameters over 64 bits are passed in memory. This means you have to pass a 64 bit pointer to the memory holding the parameter that is over 64 bits. If the parameter is a structure, it likely has to be 16 byte aligned. If you are calling a Windows operating system function that passes a pointer to a structure, the structure very likely HAS to be 16 byte aligned. This is particularly true for Windows functions which copy the structure. If you are passing a pointer to a null terminated string, I'm pretty sure the string does not have to be aligned. There may be exceptions for some things but the documentation didn't list those. (see Alignment section of the calling convention page) https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention?view=msvc-170

Returns over 64 bits are not supported. But you can pass a pointer to memory as one of the arguments and use that to return the larger part of the value.

RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-XMM15 must be preserved.

Any functions that can throw an exception, such as a bad memory error, must use the Microsoft standard frame format and leave the RSP 16 byte aligned. (So when the exception happens, the RSP must be 16 byte aligned.) https://docs.microsoft.com/en-us/cpp/build/stack-usage?view=vs-2019

    The frame is this:
  • param n
  • ...
  • param 5
  • shadow r9
  • shadow r8
  • shadow rdx
  • shadow rcx (this address is 16 byte aligned)
  • return address
  • saved registers that must be preserved
  • local variables // the frame pointer usually points to the lowest memory local variable

Functions that can throw exceptions must also generate unwind table data. Masm has directives to make this easier and they are documented here: https://docs.microsoft.com/en-us/cpp/build/exception-handling-x64?view=vs-2019. DiaperGlu does not... If you have a function that can throw exceptions and do not generate a correct unwind table for it, what I've noticed is, sometimes it may work and sometimes it may not. Like as in, when you call the same code there is a 50/50 chance the exception will be handled properly. It's not a good idea to skip this if you want to trap exceptions. P.S. If you are pushing the flags register in the prolog, it's considered to be a sub rsp 8 unwind thing.

There are end of subroutine requirements for functions that can throw exceptions which are these: https://docs.microsoft.com/en-us/cpp/build/prolog-and-epilog?view=vs-2019

Docs say you can put stuff on the return stack after the frame, but it may confuse the unwind code...

If you are calling C functions, in most cases the rsp has to be 16 byte aligned after you push the parameters and are just about to do the call. You also have to push 4 dummy parameters for the called functions to use, and for the unwind code if any exceptions happen. You can kinda get away with not following these rules if the subroutine you call does not throw exceptions, does not use any opcodes requiring the return stack to be 16 byte aligned, and the subroutine does not use the 4 dummy shadow region as local storage. Since most of the time all data will be passed in registers this means you will have to push a fake parameter onto the stack. The exception to this is at program entry. I couldn't find anything that says what the alignment is at program entry, but since it's probably a new empty stack, the return stack is probably already 16 byte aligned at program entry. The documentation does say the return stack has to be 16 byte aligned at all times except during a function call prologue... and it isn't 16 byte aligned at function entry. If you want to be sure of the alignment under any circumstances, you can do what I do and AND the RSP with -0x10 to force 16 byte alignment. If you use this technique in a subroutine, you will need to recover the return stack pointer before returning, but that's one of the things the RBP frame register is for.

Position independent code

In 64 bit mode it is really easy to do position independent code. This means you don't have to worry about where your code will be loaded, or if you are compiling to a buffer that can move, the code will still work if the buffer moves. I read somewhere that newer versions of Mac OS X somehow trick your applications into thinking they loaded at address 0, so doing position independent code might not be needed any more... but I still recommend it.

Here is one way to do it using Diaperglu Forth wordlists to hold the symbols:

DECIMAL

OHERE CONSTANT oMyData   // offset in current compile buffer of your data
12345678 CODE-U64,       // your data

CODE myfunctionCALL,      // standard C function to return the number
  oMyData [O]  RAX  MOV,  // compiles an [RIP+N] instruction to get oMyData
  RET,
END-CODE

CODE callmyfunctionCALL,
  myfunctionCALL,         // compiles a pc relative call to myfunction
  RET,
END-CODE

CODE callthehardwayCALL,
  ' myfunction >BODY
  PCURRENTCOMPILEBUFFER @ DUP
  LENGTHBUF SWAP O>P 5 + - RIP+N CALL,
  RET,
END-CODE

Here's a way to call the functions you just compiled.

0                           // number of parameters
' myfunctionCALL, >BODY     // get the address (you may need to use the address
                            //  before you compile anything else)
CALLPROC                    // call the function
U.                          // see the result

0                           // number of parameters
' callthehardwayCALL, >BODY // get the address (you may need to use the address
                            //  before you compile anything else)
CALLPROC                    // call the function
U.                          // see the result

Here is another way using Diaperglu symbol lists to hold the symbols:

X86-WORDLIST >SEARCH-ORDER  // Adding the assembler to the search order stack.
                            // The assembler is not normally in the search order
                            // because it is really big and some word names
                            // may conflict with other things.
 


NEW-NGLULIST>EH             // making a hierarchical list to hold a symbol list
                            // and making it the current elementid hlistid
                            // ( noparentelementid_myhlistid -EH- )

                            
$" " 0 >NEW$  EH-NEW-ELEMENT>EH  // adding a root element to the current EH hlist
                                 // to hold the symbol list
                            
        ( noparentelementid_myhlistid symbolsparentelemendid_mylistid -EH- )

DECIMAL
OSYMBOL oMyData  987654 CODE-U64,
  
OSYMBOL oMyFunction
  EH-NAMEW>VALUE oMyData [O]  RAX  MOV,
  RET,
  
OSYMBOL oCallMyFunction
  EH-NAMEW>VALUE oMyFunction O CALL,
  RET,
  
SEARCH-ORDER> DROP  // removing the assembler from the search order stack

One way to call the function:

0                                         // the number of parameters
EH-NAMEW>VALUE oCallMyFunction            // find the offset of the function
PCURRENTCOMPILEBUFFER @ GETSBUFFER DROP   // get the current base address of your
                                          // current compile buffer
+                                         // get the address of the function
CALLPROC                                  // call the function
U.                                        // see the result

// Cleaning up

DROPEH                                    // drop symbolsparentelemendid_mylistid
                                          //  from the EH stack
EH>                                       // pop the hlist id from the EH stack
FREE-HLIST                                // throw away the symbol list hlist
DROP                                      // drop the left over no parent element id
// //////////////////////////////////////////////////////////////////////////////////////
//
// How to compile branches and loops:
//
// You may have noticed the X86 jump and conditional jump instructions are missing.
//  If you want to use those, you can use these instead:
//   COMPILE-BRANCH
//   RESOLVE-COMPILED-BRANCH
//
// To get a jump, use the condition code ALWAYS.
//
// However, this assembler comes with the standard Forth assembler flow control
//  compiling words (and a few others) which eliminate the need for a lot of labels.
//  These compiling words are:
//   BEGIN,
//   UNTIL,
//   IF,
//   ELSE,
//   THEN,
//   WHILE,
//   REPEAT,
//   LOOPNOTDONEWHILE,
//   NZORLOOPNOTDONEWHILE,
//   NEORLOOPNOTDONEWHILE,
//   ZSORLOOPNOTDONEWHILE,
//   EQORLOOPNOTDONEWHILE,
//   LOOPDONEUNTIL,
//   ZSORLOOPDONEUNTIL,
//   EQORLOOPDONEUNTIL,
//   NEORLOOPDONEUNTIL,
//   NZORLOOPDONEUNTIL,
//
//
// These are the condition code names:
//
//   VS   for overflow set
//   NV   for overflow clear or no overflow
//   CS   for carry set
//   NC   for carry clear or no carry
//   ULT  for unsigned less than
//   ULE  for unsigned less than or equal
//   UGT  for unsigned greater than
//   UGE  for unsigned greater than or equal
//   ZS   for zero set
//   NZ   for zero clear or no zero
//   EQ   for equal or zero set
//   NE   for not equal or zero clear
//   SS   for sign set
//   NS   for no sign or sign clear
//   MI   for minus or sign set
//   PL   for plus or sign clear
//   PS   for parity set
//   NP   for no parity or parity clear
//   LT   for signed less than
//   GE   for signed greater than or equal
//   LE   for signed less than or equal
//   GT   for signed greater than
//   ALWAYS
//   NEVER
//
//
// Example of using IF, THEN,
//
// This function returns true if you pass in a carriage return in the
//  low byte in RDI, false otherwise.
//
// HEX
// CODE dg_queryiscr       // C prototype: UINT64 dg_queryiscr ( unsigned char c );
//  RAX RAX XOR,           //  this checks low byte of RDI for a carriage return
//  0D N  DIL  CMP,        //  if it is, return true, otherwise return false
//  EQ IF,
//   RAX DEC,
//  THEN,
//  RET,
// END-CODE
//
// This should return -1:
//
//  0D 1 ' dg_queryiscr >BODY CALLPROC U.
//
// And this should return 0:
//
//  0C 1 ' dg_queryiscr >BODY CALLPROC U.
//
//
// Example of using BEGIN, UNTIL,
//
// This function counts the number of set bits passed in. And yes there is an
//  x86 instruction that does this in one step.
//
// HEX
// CODE dg_countbits    // C prototype: UINT64 dg_countbits ( UINT64 u );
//  RAX RAX XOR,        // clears RAX
//  40 N  DL  MOV,      // There are 64 bits to count
//  BEGIN,
//   1 N  RDI  SHL,     // shifts a bit out of RDI into the carry flag
//   0 N  AL  ADC,      // adds 0 + the carry flag to AL
//   DL  DEC,           // decrements the loop counter and sets the zero flag if 0
//  EQ UNTIL,           // keep looping until the zero flag is set
//  RET,
// END-CODE
//
// This should return 5
//
// HEX 30038000 1 ' dg_countbits >BODY CALLPROC U.
//
//
// Example of using BEGIN, WHILE, REPEAT,
//
// This function looks for a 0 byte in the number passed in. This is sort of like
//  getting the length of a c style null terminated string if the string was stored
//  in a UINT64. If no 0 is found, 8 is returned, otherwise the byte index of
//  where 0 was found is returned.
//
// HEX
// CODE dg_countcharacters // C prototype: UINT64 dg_countcharacters ( UINT64 u );
//  RAX RAX XOR,           // clears RAX
//  8 N  DL  MOV,          // There are possible 8 bytes to count
//  BEGIN,
//   DIL 0 N CMP,          // is the low byte of RDI a 0?
//  NE WHILE,              // if not then keep going
//   8 N RDI SHR,          // shift RDI one byte to the right
//   AL INC,               // increment the counter
//  REPEAT,
//  RET,
// END-CODE
//
// This should return 3
//
// HEX 554400332211 1 ' dg_countcharacters >BODY CALLPROC U.
//
// //////////////////////////////////////////////////////////////////////////////////////

Reference documentation:

links checked April 6, 2020

// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_checkifvaluefits
//
// C prototype: 
//  const char* dg_checkifvaluefits(
//   UINT64 valuesizeinmem,
//   UINT64 usedvaluesize,   // sign extended to this size
//   UINT64 value)
//
// Inputs:
//  UINT64 valuesizeinmem
//  UINT64 usedvaluesize
//  UINT64 value
//
// Return:
//  const char* return        dg_success  if value fits
//                            dg_checkifvaluefitssizebaderror  if
//                             valuesizeinmem > usedvaluesize
//                            dg_valuetoobigerror  if the value is too big to fit
//
// Action:
//  Tests whether or not value will fit in the size needed when it is compiled
//   and returns an error flag if it does not.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_isrexbaseorindexreg
//
// C prototype: 
//  UINT64 dg_isrexbaseorindexreg (UINT64 reg)
//
// Inputs:
//  UINT64 reg
//
// Return:
//  BOOL return             FORTH_TRUE if reg is one of:
//                           R8, R9, R10, R11, R12, R13, R14 or R15
//                          FORTH_FALSE otherwise
//
// Action:
//  Tests whether or not the register is a REX base or index register.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_isrexsrcordestreg
//
// C prototype: 
//  UINT64 dg_isrexsrcordestreg (UINT64 reg)
//
// Inputs:
//  UINT64 reg
//
// Return:
//  BOOL return             FORTH_TRUE if reg is one of:
//                           R8L, R9L, R10L, R11L, R12L, R13L, R14L, R15L,
//                           R8W, R9W, R10W, R11W, R12W, R13W, R14W, R15W,
//                           R8D, R9D, R10D, R11D, R12D, R13D, R14D, R15D,
//                           R8, R9, R10, R11, R12, R13, R14, or R15
//                          FORTH_FALSE otherwise
//
// Action:
//  Tests whether or not the register is a REX source or destination register.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_isdefaultaddresssizeonetarget
//
// C prototype: 
//  UINT64 dg_isdefaultaddresssizeonetarget (struct Onetargetopcodestrings* popcodes)
//
// Inputs:
//  struct Onetargetopcodestrings* popcodes
//
// Return:
//  BOOL return           FORTH_TRUE if opcode table has the opcode for PUSH, or POP,
//                        FORTH_FALSE otherwise
//
// Action:
//  Tests whether or not the opcode table passed in has the opcode for PUSH, or POP,
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilemovntorax
//
// C prototype: 
//  void dg_compilemovntorax (
//   Bufferhandle* pBHarrayhead,
//   UINT64 u)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
//  UINT64        u               UINT64 value to go into RAX
//
// Return: none
//
// Action:
//  Compiles code to move a UINT64 value into RAX.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilemovntoeax
//
// C prototype: 
//  void dg_compilemovntoeax (
//   Bufferhandle* pBHarrayhead,
//   UINT64 u)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
//  UINT64        u               UINT32 value to go into EAX, 
//                                 u is truncated to 32 bits
//
// Return: none
//
// Action:
//  Compiles code to move a UINT32 value into EAX.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compileaddn32torax
//
// C prototype: 
//  void dg_compileaddn32torax (
//   Bufferhandle* pBHarrayhead,
//   UINT64 u)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//  UINT64        u               UINT32 value to add to RAX, 
//                                 u is truncated to 32 bits
//
// Return: none
//
// Action:
//  Compiles code to add a UINT32 value to RAX.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilesubn8fromrsp
//
// C prototype: 
//  void dg_compilesubn8fromrsp (
//   Bufferhandle* pBHarrayhead,
//   INT64 n8)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
//  INT64        n8               signed INT8 value to get subtracted from RSP 
//
// Return: none
//
// Action:
//  Compiles code to subtract a signed one byte immediate value from RSP.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compileaddn8torsp
//
// C prototype: 
//  void dg_compileaddn8torsp (
//   Bufferhandle* pBHarrayhead,
//   INT64 n8)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
//  INT64        n8               signed INT8 value to get added to RSP 
//
// Return: none
//
// Action:
//  Compiles code to add a signed one byte immediate value to RSP.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilepushn64toret
//
// C prototype: 
//  void dg_compilepushn64toret (
//   Bufferhandle* pBHarrayhead,
//   UINT64 n64)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
//  UINT64        n64             64 bit value to get pushed to ret 
//
// Return: none
//
// Action:
//  Compiles code to push a 64 bit integer to the return stack.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilesubnfromrsp
//
// C prototype: 
//  void dg_compilesubnfromrsp (
//   Bufferhandle* pBHarrayhead,
//   UINT64 u31)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
//  UINT64        u31             unsigned UINT31 value to get subtracted from RSP 
//
// Return: none
//
// Action:
//  Compiles code to subtract an immediate value from RSP using
//    the shorter of either a signed one byte immediate value or signed 4 byte
//    immediate value. This means if u31 is 0, it still compiles using the one byte
//    immediate value encoding.
//
// Note:
//   This function does not handle negative values for u31.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilepushu64toret
//
// C prototype: 
//  void dg_compilepushu64toret (
//   Bufferhandle* pBHarrayhead,
//   UINT64 u64)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
//  UINT64        u64             unsigned UINT64 value to get pushed to the return
//                                 stack 
//
// Return: none
//
// Action:
//  Compiles code to push an immediate 64BIT value to RSP.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compileaddn8torsp
//
// C prototype: 
//  void dg_compileaddn8torsp (
//   Bufferhandle* pBHarrayhead,
//   INT64 n8)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
//  INT64        n8               signed INT8 value to get added to RSP 
//
// Return: none
//
// Action:
//  Compiles code to add a signed one byte immediate value to RSP.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compileaddntorsp
//
// C prototype: 
//  void dg_compileaddntorsp (
//   Bufferhandle* pBHarrayhead,
//   UINT64 u31)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
//  UINT64        u31             unsigned UINT31 value to get added to RSP 
//
// Return: none
//
// Action:
//  Compiles code to add an unsigned immediate value to RSP using
//    the shorter of either a signed one byte immediate value or signed 4 byte
//    immediate value. This means if u31 is 0, it still compiles using the one byte
//    immediate value encoding.
//
// Note:
//   This function does not handle negative values for u31.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilemovntoreg
//
// C prototype: 
//  void dg_compilemovntoreg(
//   Bufferhandle* pBHarrayhead,
//   UINT64 x,
//   UINT64 reg)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//  UINT64        x               UINT64 value to move into register
//  UINT64        reg             64 bit target register
//
// Return: none
//
// Action:
//  Compiles code to move a 64 bit value into a 64 bit register.
//   (Works for REX registers too.)
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilemovregtoreg
//
// C prototype: 
//  void dg_compilemovregtoreg (
//   Bufferhandle* pBHarrayhead,
//   UINT64 srcreg,
//   UINT64 destreg)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//  UINT64        srcreg          64 bit source integer register
//  UINT64        destreg         64 bit destination integer register
//
// Return: none
//
// Action:
//  Compiles code to move a 64 bit value from a 64 bit source integer register
//   to a 64 bit destination integer register.
//   (Works for REX registers too.)
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilemovfregtofreg
//
// C prototype: 
//  void dg_compilemovfregtofreg (
//   Bufferhandle* pBHarrayhead,
//   UINT64 srcreg,
//   UINT64 destreg)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//  UINT64        srcreg          64 bit source floating point register
//  UINT64        destreg         64 bit destination floating point register
//
// Return: none
//
// Action:
//  Compiles code to move a 64 bit value from a 64 bit source floating point register
//   to a 64 bit floating point destination register.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compileaddregtoreg
//
// C prototype: 
//  void dg_compileaddregtoreg (
//   Bufferhandle* pBHarrayhead,
//   UINT64 srcreg,
//   UINT64 destreg)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//  UINT64        srcreg          64 bit source register
//  UINT64        destreg         64 bit destination register
//
// Return: none
//
// Action:
//  Compiles code to add a 64 bit value from a 64 bit source register
//   to a 64 bit destination register.
//   (Works for REX registers too.)
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilemovbracketrsptoreg
//
// C prototype: 
//  void dg_compilemovbracketrsptoreg (
//   Bufferhandle* pBHarrayhead,
//   UINT64 ireg)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//  UINT64        ireg            64 bit destination integer register
//
// Return: none
//
// Action:
//  Compiles RSP [R] ireg -> MOV, where ireg is a 64 bit integer register.
//  Another way to word this is it compiles code to copy the 64 bit value in
//   the memory at the address in RSP to the 64 bit integer register ireg.
//   (Works for REX registers too.)
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilemovbracketrsptofreg
//
// C prototype: 
//  void dg_compilemovbracketrsptofreg (
//   Bufferhandle* pBHarrayhead,
//   UINT64 freg)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//  UINT64        freg            64 bit destination floating point register
//
// Return: none
//
// Action:
//  Compiles RSP [R] freg -> MOV, where freg is a 64 bit floating point register.
//  Another way to word this is it compiles code to copy the 64 bit value in
//   the memory at the address in RSP to the 64 bit floating point register freg.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilemovbracketrspd8toreg
//
// C prototype: 
//  void dg_compilemovbracketrspd8toreg (
//   Bufferhandle* pBHarrayhead,
//   UINT64 ireg,
//   INT64 displacement8)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//  UINT64        ireg            64 bit destination integer register
//  INT64         displacement8   signed 8 bit offset
//
// Return: none
//
// Action:
//  Compiles RSP displacement8 [R+N] ireg -> MOV, where displacement8 is 8 bits and
//   ireg is a 64 bit integer register.
//  Another way to word this is it compiles code to copy the 64 bit value from the
//   memory at the address in RSP plus a signed 8 bit offset to a 64 bit integer
//   register.
//   (Works for REX registers too.)
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilemovbracketrspd8tofreg
//
// C prototype: 
//  void dg_compilemovbracketrspd8tofreg (
//   Bufferhandle* pBHarrayhead,
//   UINT64 freg,
//   INT64 displacement8)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//  UINT64        freg            64 bit destination floating point register
//  INT64         displacement8   signed 8 bit offset
//
// Return: none
//
// Action:
//  Compiles RSP displacement8 [R+N] freg -> MOV, where displacement8 is 8 bits and 
//   freg is a 64 bit floating point register.
//  Another way to word this is it compiles code to copy the 64 bit value from the
//   memory at the address in RSP plus a signed 8 bit offset to a 64 bit floating
//   point register.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilemovbracketrbpd8toreg
//
// C prototype: 
//  void dg_compilemovbracketrbpd8toreg (
//   Bufferhandle* pBHarrayhead,
//   UINT64 ireg,
//   INT64 displacement8)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//  UINT64        ireg            64 bit integer destination register
//  INT64         displacement8   signed 8 bit offset
//
// Return: none
//
// Action:
//  Compiles RBP displacement8 [R+N] ireg -> MOV, where displacement8 is 8 bits and
//   ireg is a 64 bit integer register.
//  Another way to word this is it compiles code to copy the 64 bit value from the
//   memory at the address in RBP plus a signed 8 bit offset to a 64 bit integer
//   register.
//   (Works for REX registers too.)
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilemovbracketrspd32toreg
//
// C prototype: 
//  void dg_compilemovbracketrspd32toreg (
//   Bufferhandle* pBHarrayhead,
//   UINT64 reg,
//   INT64 displacement32)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//  UINT64        ireg            64 bit integer destination register
//  INT64         displacement32  signed 32 bit offset
//
// Return: none
//
// Action:
//  Compiles RSP displacement32 [R+N] ireg -> MOV, where displacement32 is 32 bits 
//   and ireg is a 64 bit integer register.
//  Another way to word this is it compiles code to copy the 64 bit value in
//   the memory at the address in RSP plus a signed 32 bit offset to an integer
//   register.
//   (Works for REX registers too.)
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilemovbracketrspd32tofreg
//
// C prototype: 
//  void dg_compilemovbracketrspd32tofreg (
//   Bufferhandle* pBHarrayhead,
//   UINT64 freg,
//   INT64 displacement32)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//  UINT64        freg            64 bit floating point destination register
//  INT64         displacement32  signed 32 bit offset
//
// Return: none
//
// Action:
//  Compiles RSP displacement32 [R+N] freg -> MOV, where displacement32 is 32 bits 
//   and freg is a 64 bit floating point register.
//  Another way to word this is it compiles code to copy the 64 bit value in
//   the memory at the address in RSP plus a signed 32 bit offset to a floating
//   point register.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilemovbracketrbpd32toreg
//
// C prototype: 
//  void dg_compilemovbracketrbpd32toreg (
//   Bufferhandle* pBHarrayhead,
//   UINT64 ireg,
//   INT64 displacement32)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//  UINT64        ireg            64 bit integer destination register
//  INT64         displacement32  signed 32 bit offset
//
// Return: none
//
// Action:
//  Compiles RBP displacement32 [R+N] ireg -> MOV, where displacement32 is 32 bits  
//   and ireg is 64 bits.
//  Another way to word this is it compiles code to copy the 64 bit value in
//   the memory at the address in RBP plus a signed 32 bit offset to an integer
//   register.
//   (Works for REX registers too.)
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilemovbracketrspdtoreg
//
// C prototype: 
//  void dg_compilemovbracketrspdtoreg (
//   Bufferhandle* pBHarrayhead,
//   UINT64 reg,
//   INT64 displacement)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//  UINT64        reg             64 bit destination register
//  INT64         displacement    signed offset (up to 32 bits)
//
// Return: none
//
// Action:
//  Compiles RSP displacement [R+N] ireg -> MOV, or
//   RSP displacement [R+N] freg -> MOVQ, where displacement is 32 bits or  
//   less and reg is 64 bits using the smallest displacement possible.
//  Another way to word this is it compiles code to copy the 64 bit value in
//   the memory at the address in RSP plus a signed up to 32 bit offset to an integer
//   or floating point register.
//   (Works for REX and floating point registers too.)
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilemovregtobracketrsp
//
// C prototype: 
//  void dg_compilemovregtobracketrsp (
//   Bufferhandle* pBHarrayhead,
//   UINT64 ireg)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//  UINT64        ireg             64 bit integer source register
//
// Return: none
//
// Action:
//  Compiles ireg RSP [R] -> MOV, where ireg is a 64 bit integer register.
//  Another way to word this is it compiles code to copy the 64 bit value
//   in a 64 bit integer register to the memory at the address in RSP.
//   (Works for REX registers too.)
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilemovfregtobracketrsp
//
// C prototype: 
//  void dg_compilemovfregtobracketrsp (
//   Bufferhandle* pBHarrayhead,
//   UINT64 freg)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//  UINT64        freg            64 bit floating point source register
//
// Return: none
//
// Action:
//  Compiles freg RSP [R] -> MOVQ, where freg is a 64 bit floating point register.
//  Another way to word this is it compiles code to copy the 64 bit value
//   in a 64 bit floating point register to the memory at the address in RSP.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilemovregtobracketrspd8
//
// C prototype: 
//  void dg_compilemovregtobracketrspd8 (
//   Bufferhandle* pBHarrayhead,
//   UINT64 ireg,
//   INT64 displacement8)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//  UINT64        ireg            64 bit integer source register
//  INT64         displacement8   signed 8 bit displacement
//
// Return: none
//
// Action:
//  Compiles ireg RSP displacement8 [R+N] -> MOV, where displacement8 is 8 bits and 
//   ireg is a 64 bit integer register.
//  Another way to word this is it compiles code to copy the 64 bit value
//   in a 64 bit integer register to the memory at the address in RSP plus a
//   signed 8 bit offset.
//   (Works for REX registers too.)
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilemovfregtobracketrspd8
//
// C prototype: 
//  void dg_compilemovfregtobracketrspd8 (
//   Bufferhandle* pBHarrayhead,
//   UINT64 freg,
//   INT64 displacement8)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//  UINT64        freg            64 bit floating point source register
//  INT64         displacement8   signed 8 bit displacement
//
// Return: none
//
// Action:
//  Compiles freg RSP displacement8 [R+N] -> MOVQ, where displacement8 is 8 bits and 
//   freg is a 64 bit floating point register.
//  Another way to word this is it compiles code to copy the 64 bit value
//   in a 64 bit floating point register to the memory at the address in RSP
//   plus a signed 8 bit offset.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilemovregtobracketrbpd8
//
// C prototype: 
//  void dg_compilemovregtobracketrbpd8 (
//   Bufferhandle* pBHarrayhead,
//   UINT64 reg,
//   INT64 displacement8)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//  UINT64        reg             64 bit source register
//  INT64         displacement8   signed 8 bit displacement
//
// Return: none
//
// Action:
//  Compiles reg RBP n [R+N] -> MOV, where n is 8 bits and reg is 64 bits.
//  Another way to word this is it compiles code to copy the 64 bit value
//   in a 64 bit register to the memory at the address in RBP plus a signed
//   8 bit offset.
//   (Works for REX registers too.)
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilemovregtobracketrbpd32
//
// C prototype: 
//  void dg_compilemovregtobracketrbpd32 (
//   Bufferhandle* pBHarrayhead,
//   UINT64 reg,
//   INT64 displacement32)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//  UINT64        reg             64 bit source register
//  INT64         displacement32  signed 32 bit displacement
//
// Return: none
//
// Action:
//  Compiles reg RBP n [R+N] -> MOV, where n is 32 bits and reg is 64 bits.
//  Another way to word this is it compiles code to copy the 64 bit value
//   in a 64 bit register to the memory at the address in RBP plus a signed
//   32 bit offset.
//   (Works for REX registers too.)
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilemovregtobracketrbpd
//
// C prototype: 
//  void dg_compilemovregtobracketrbpd (
//   Bufferhandle* pBHarrayhead,
//   UINT64 reg,
//   INT64 displacement)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//  UINT64        reg             64 bit source register
//  INT64         displacement    signed displacement
//
// Return: none
//
// Action:
//  Compiles reg RBP n [R+N] -> MOV, where n is 8 or 32 bits and reg is 64 bits.
//  Another way to word this is it compiles code to copy the 64 bit value
//   in a 64 bit register to the memory at the address in RBP plus a signed 8 or
//   32 bit offset. This uses the smallest possible displacement. Does not support
//   floating point registers.
//   (Works for REX registers too.)
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilepushbracketrbpd8
//
// C prototype: 
//  void dg_compilepushbracketrbpd8 (
//   Bufferhandle* pBHarrayhead,
//   INT64 displacement8)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//  INT64        displacement8    signed 8 bit displacement
//
// Return: none
//
// Action:
//  Compiles RBP n [R+N] -> PUSH, where n is 8 bits and the value pushed is 64 bits.
//  Another way to word this is it compiles code to push the 64 bit value
//   in memory at the address in RBP plus a signed 8 bit offset to the return stack.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilemovregtobracketrspd32
//
// C prototype: 
//  void dg_compilemovregtobracketrspd32 (
//   Bufferhandle* pBHarrayhead,
//   UINT64 ireg,
//   INT64 displacement32)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//  UINT64        ireg            64 bit integer source register
//  INT64         displacement32  signed 32 bit displacement
//
// Return: none
//
// Action:
//  Compiles ireg RSP displacement32 [R+N] -> MOV, where displacement32 is 32 bits and 
//   ireg is a 64 bit integer register.
//  Another way to word this is it compiles code to copy the 64 bit value
//   in a 64 bit integer register to the memory at the address in RSP plus a
//   signed 32 bit offset.
//   (Works for REX registers too.)
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilemovfregtobracketrspd32
//
// C prototype: 
//  void dg_compilemovfregtobracketrspd32 (
//   Bufferhandle* pBHarrayhead,
//   UINT64 freg,
//   INT64 displacement32)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//  UINT64        freg            64 bit floating point source register
//  INT64         displacement32  signed 32 bit displacement
//
// Return: none
//
// Action:
//  Compiles freg RSP displacement32 [R+N] -> MOVQ, where displacement32 is 32 bits and 
//   freg is a 64 bit floating point register.
//  Another way to word this is it compiles code to copy the 64 bit value
//   in a 64 bit floating point register to the memory at the address in RSP
//   plus a signed 32 bit offset.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilemovregtobracketrspd
//
// C prototype: 
//  void dg_compilemovregtobracketrspd (
//   Bufferhandle* pBHarrayhead,
//   UINT64 reg,
//   INT64 displacement)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//  UINT64        reg             64 bit source register
//  INT64         displacement    signed up to 32 bit displacement
//
// Return: none
//
// Action:
//  Compiles ireg RSP displacement [R+N] -> MOV, or
//   freg RSP displacement [R+N] -> MOVQ, where displacement is up to 32 bits and
//   reg is a 64 bit integer or floating point register.
//  Another way to word this is it compiles code to copy the 64 bit value
//   in a 64 bit integer or floating point register to the memory at the address
//   in RSP plus a signed offset of up to 32 bits, using the smallest possible
//   offset size.
//  Displacement will be compiled using 0, 8, or 32 bits.
//  (Works for REX registers and floating point registers too.)
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilejumptorax ( RAXJMP, EAXJMP, )
//
// C prototype: 
//  void dg_compilejumptorax (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Outputs:
//  none
//                              
// Action:
//  Compiles RAX JMP,
//  REX is not needed to indicate a 64 bit address so REX is not compiled for
//   this instruction.
//
// Failure cases:
//  current compile buffer id is for a nonexistent or invalid buffer
//  out of memory when trying to grow current compile buffer
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilecalltorax
//
// C prototype: 
//  void dg_compilecalltorax (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Outputs:
//  none
//                              
// Action:
//  Compiles RAX CALL,
//  REX is not needed to indicate a 64 bit address so REX is not compiled for
//   this instruction.
//
// Failure cases:
//  current compile buffer id is for a nonexistent or invalid buffer
//  out of memory when trying to grow current compile buffer
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilereturn ( RET, )
//
// C prototype: 
//  void dg_compilereturn (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Outputs:
//  none
//                              
// Action:
//  Compiles a return from subroutine onto the end of the current compile buffer.
//
// Failure cases:
//  current compile buffer id is for a nonexistent or invalid buffer
//  out of memory when trying to grow current compile buffer
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compileclc ( CLC, )
//
// C prototype: 
//  void dg_compileclc (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Outputs:
//  none
//                              
// Action:
//  Compiles a clear carry flag operation onto the end of the current compile 
//   buffer.
//
// Note:
//  This function is not used in Diaperglu, but is used in the automated test.
//
// Failure cases:
//  current compile buffer id is for a nonexistent or invalid buffer
//  out of memory when trying to grow current compile buffer
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilestc ( STC, )
//
// C prototype: 
//  void dg_compilestc (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the
//                                 other bufferhandles are stored.
//
// Outputs:
//  none
//                              
// Action:
//  Compiles a set carry flag operation onto the end of the current compile buffer.
//  This function is not used in Diaperglu, but is used in the automated test.
//
// Failure cases:
//  current compile buffer id is for a nonexistent or invalid buffer
//  out of memory when trying to grow current compile buffer
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilecalloffset
//
// C prototype: 
//  void dg_compilecalloffset (
//   Bufferhandle* pBHarrayhead,
//   INT64 offset)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the
//                                 other bufferhandles are stored.
//  INT64         offset          program counter relative signed 32 bit offset
//
// Outputs:
//  none
//                              
// Action:
//  Compiles a program counter relative call onto the end of the current compile
//   buffer.
//  The displacement compiled into the call is the offset passed in to this
//   routine. The offset must fit into a signed 32 bit integer.
//   It would be nice if the code automatically handled larger displacements...
//   but will this ever be needed?
//
// Failure cases:
//  current compile buffer id is for a nonexistent or invalid buffer
//  out of memory when trying to grow current compile buffer
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilejmpoffset
//
// C prototype: 
//  void dg_compilejmpoffset (
//   Bufferhandle* pBHarrayhead,
//   INT64 offset)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the
//                                 other bufferhandles are stored.
//  INT64         offset          program counter relative signed 32 bit offset
//
// Outputs:
//  none
//                              
// Action:
//  Compiles a program counter relative jump onto the end of the current compile
//   buffer.
//  The displacement compiled into the jump is the offset passed in to this
//   routine. The offset must fit into a signed 32 bit integer.
//  This does RIP+offset->RIP.
//
// Failure cases:
//  current compile buffer id is for a nonexistent or invalid buffer
//  out of memory when trying to grow current compile buffer
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilejmpbracketoffset
//
// C prototype: 
//  void dg_compilejmpbracketoffset (
//   Bufferhandle* pBHarrayhead,
//   INT64 offset)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the
//                                 other bufferhandles are stored.
//  INT64         offset          program counter relative signed 32 bit offset
//
// Outputs:
//  none
//                              
// Action:
//  Compiles a jump to the 64 bit address in the memory at the program counter
//   relative address onto the end of the current compile buffer.
//  The displacement compiled into the jump is the offset passed in to this
//   routine. The offset must fit into a signed 32 bit integer.
//  This does [RIP+offset]->RIP.
//
// Failure cases:
//  current compile buffer id is for a nonexistent or invalid buffer
//  out of memory when trying to grow current compile buffer
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compileacopyofsscopyto
//
// C prototype: 
//  void dg_compileacopyofsscopyto(
//    Bufferhandle* pBHarrayhead,
//    unsigned char* psqstr,
//    UINT64 sqstrlen)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the
//                                 other bufferhandles are stored.
//  unsigned char* psqstr         pointer to source string
//  UINT64 sqstrlen               source string length in bytes
//
// Outputs:
//  none
//                              
// Action:
//  Compiles code to push the address and length of a copy of the source string
//   to the data stack. 
//  (Compiles a call over the copy of the string, which gives the address on the
//   return stack, then compiles code to push the address and length to the
//   data stack.)
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilecopystonewstring
//
// C prototype: 
//  void dg_compilecopystonewstring (
//    Bufferhandle* pBHarrayhead,
//    const char* pstring,
//    UINT64 stringlength)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the
//                                 other bufferhandles are stored.
//  const char* pstring           pointer to source string
//  UINT64 stringlength           source string length in bytes
//
// Outputs:
//  none
//                              
// Action:
//  Compiles code to push a copy of the source string to the string stack. 
//  (Compiles a call over a copy of the string, which gives the address on the
//   return stack, then compiles code to push a copy of this copy to a
//   new string on the string stack.)
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilecalloffsetinsamebuffer
//
// C prototype: 
//  void dg_compilecalloffsetinsamebuffer (
//   Bufferhandle* pBHarrayhead,
//   INT64 targetoffset)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where  
//                                 the other bufferhandles are stored.
//  INT64         targetoffset    offset from beginning of current compile buffer
//                                 of target subroutine
//
// Outputs:
//  none
//                              
// Action:
//  Compiles a program counter relative call onto the end of the current compile
//   buffer.
//  The offset in the call is calculated to call the subroutine target offset
//   bytes from the beginning of the current compile buffer. This offset must
//   fit into a signed 32 bit integer. Be nice if code could automatically do
//   larger displacements...
//
// Failure cases:
//  current compile buffer id is for a nonexistent invalid buffer
//  out of memory when trying to grow current compile buffer
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilecalladdress
//
// C prototype: 
//  void dg_compilecalladdress (
//   Bufferhandle* pBHarrayhead,
//   UINT64 addr)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the
//                                 other bufferhandles are stored.
//  UINT64        addr            target address of routine
//                                 this is an absolute address
//
// Outputs:
//  none
//                              
// Action:
//  Compiles an absolute call address onto the end of the current compile buffer.
//
// Note:
//  Don't use this routine to compile calls to addresses in buffers that can move.
//
// Failure cases:
//  current compile buffer id is for a nonexistent invalid buffer
//  out of memory when trying to grow current compile buffer
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilecallfunctblfunction
//
// C prototype: 
//  void dg_compilecallfunctblfunction (
//   Bufferhandle* pBHarrayhead,
//   UINT64 functionindex)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the
//                                 other bufferhandles are stored.
//
//  UINT64 functionindex          0 based index into the DiaperGlu function table
//                                 the base address of this table is an element of
//                                 the structure pointed to by pBHarrayhead.
//
// Outputs:
//  none
//
// Action:
//  Compiles a call to a DiaperGlu function table function.
//
// Failure cases:
//  current compile buffer id is for a nonexistent invalid buffer
//  out of memory when trying to grow current compile buffer
// 
// //////////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthdocompiletypecall
//
// C prototype: 
//  void dg_forthdocompiletypecall (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the
//                                 other bufferhandles are stored.
//
// Outputs:
//  none
//
// Stack Action Shorthand:
//  ( dataoffset databufid -- )
//
// Data stack in:
//  UINT64  dataoffset            target offset of call
//  UINT64  databufid             target buffer of call
//
// Action:
//  If databufid is DG_CORE_BUFFERID then
//   Compiles an absolute call address onto the end of the current compile buffer.
//  If databufid is the current compile buffer then
//   Compiles a relative call to the offset onto the end of the current compile 
//   buffer. In other words, this does:  dataoffset O CALL,
//
// Failure cases:
//  current compile buffer id is for a nonexistent invalid buffer
//  out of memory when trying to grow current compile buffer
// 
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_packmodrslashm
//
// C prototype: 
//  unsigned char dg_packmodrslashm (
//   UINT64 mode,
//   UINT64 reg1orn,
//   UINT64 reg2ormem)
//
// Inputs:
//  UINT64        mode                  0-3
//  UINT64        reg1orn               register or opcode extension 0-7
//  UINT64        reg2ormem             register 0-7
//
// Outputs:
//  unsigned char   return              packed byte  containing mod R/N R/M 
//                                                               -- --- ---
//  
// Action:
//  packs mode, reg1orn, reg2ormem into a modr/m byte
//
// Notes:
//  al, ax, eax = 0;
//  cl, cx, ecx = 1;
//  dl, dx, edx = 2;
//  bl, bx, ebx = 3;
//  ah, sp, esp = 4;
//  ch, bp, ebp = 5;
//  dh, si, esi = 6;
//  bh, di, edi = 7;
// 
// Failure cases:
//  none
// 
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_packsib
//
// C prototype: 
//  unsigned char dg_packsib (
//   UINT64 basereg,
//   UINT64 indexscalecode,
//   UINT64 indexreg)
//
// Inputs:
//  UINT64        basereg              0-7
//  UINT64        indexscalecode       0-3     
//  UINT64        indexreg             0-7
//
// Outputs:
//  unsigned char   return              packed byte containing  scale index base
//                                                                --   ---  ---
//
//  
// Action:
//  packs basereg, indexscalecode, and indexreg into a sib byte
//
// Notes:
//  eax = 0;
//  ecx = 1;
//  edx = 2;
//  ebx = 3;
//  esp = 4;
//  ebp = 5;
//  esi = 6;
//  edi = 7;
//
//  indexscalecode
//   times 1 = 0
//   times 2 = 1
//   times 4 = 2
//   times 8 = 3
// 
// Failure cases:
//  none
// 
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_packrex
//
// C prototype: 
//  unsigned char dg_packrex (
//   UINT64 size,
//   UINT64 regr,
//   UINT64 regx,
//   UINT64 regb)
//
// Inputs:
//  UINT64        size                 data size of the instruction in bytes
//  UINT64        regr                 target register
//  UINT64        regx                 scale register
//  UINT64        regb                 base register
//
// Outputs:
//  unsigned char   return              packed byte containing  REX
//
//  
// Action:
//  packs size, regr, regx, and regb into a REX byte
//  REX = 0x40 ored with:
//   8 if data size of the instruction is 64 bits
//   4 if target register is a REX register (R8 - R15)
//   2 if scale register is a REX register (R8 - R15)
//   1 if base register is a REX register (R8 - R15)
// 
// Failure cases:
//  none
// 
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilealignretstack
//
// C prototype:
//  void dg_compilealignretstack(
//   Bufferhandle* pBHarrayhead,
//   UINT64 numberofparameters)
//
// Inputs:
//  Bufferhandle* pBHarrayhead        pointer to a Bufferhandle structure which is 
//                                     used as the bufferhandle for the array where  
//                                     the other bufferhandles are stored.
//  UINT64        numberofparameters  number of parameters to push to the return  
//                                     stack for the following subroutine call
//                                                          
// Action:
//  Compiles code to align the return stack for a subroutine call.
//
// Note:
//  This is for Mac OS X compatibility
//   The Mac OS X operating system uses x86 instructions that require 16 byte
//    alignment.
//   It is also possible that gcc compiled code requires 16 byte alignment. 
//   The Mac OS X documentation requires all subroutine calls are 16 byte aligned
//    at the point of the subroutine call. (This is after the parameters have
//    been pushed to the return stack.)
//   However, calling assembler functions that do not use the x86 instructions
//    requiring alignment without aligning the return stack does seem to work.
//   Tests seem to indicate gcc compiled code does not force alignment and seems
//    to make assumptions about alignment of parent routines calling gcc compiled
//    code when gcc generates its alignment code. In short... on 64bit x86 systems
//    always keep everything 64bit aligned even if you don't directly call any
//    64bit OS functions, because those 32bit libraries you are calling may call
//    a 64bit os function and will assume they were 16 aligned when you called
//    them.
//
//  See: 
//   ADC Home>Reference Library>Guides>Tools>Compiling>Debugging>
//   Mac OS X ABI Function Call Guide>IA-32 Function Calling Conventions
//   (document existed at this location on Mac developer website July, 2009)
// 
// Failure cases:
//  current compile buffer id is for a nonexistent invalid buffer
//  out of memory when trying to grow current compile buffer
//  
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcompilealignretforn ( COMPILE-ALIGN-RET-FOR-N
//                               COMPILE-ALIGN-RET-FOR-N-PRESERVE-FLAGS )
//
// C prototype:
//  void dg_forthcompilealignretforn (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the
//                                 other bufferhandles are stored.
//
// Stack action shorthand:
//  ( n -- )
//
// Data Stack In:
//  n                             number of 64 bit units in the parameter list of
//                                 the next subroutine call
//                                 Most parameter types like bool, char, UINT16
//                                 are padded to 64 bits. Those that are more
//                                 than 64 bits like UINT128 are
//                                 usually 128 bits and are worth 2 64 bit units.
//
// Action: 
//  Compiles code to align the return stack for a subroutine call.
//
//  Note:
//   Compiled code does not alter any registers except the flags and the
//    return stack pointer.
//   Compiled code requires the Diaperglu standard frame to work. This means
//    you have to call COMPILE-ENTER-FRAME at the start of a CODE routine and
//    COMPILE-EXIT-FRAME at each exit to compile the code to enter and exit
//    the standard frame.
//   If Diaperglu is running on a 64 bit x86 processor and calls operating
//    system functions, the process may crash if the return stack is not
//    aligned. Some 64 bit operating systems require 32 byte alignment of the
//    return stack at the point of a subroutine call.
//   You may also run into this problem if you are running under an OS
//    emulator which is running on a 64 bit processor.
//   Using this aligment code fixes the problem.
//   In 64 bit mode, the first 6 parameters are passed in registers,
//    after that, parameters are passed on the return stack.
//
// Example:
//   $" libdiaperglu.dylib" LOADLIBRARY$
//
//   CODE callforthfromcode
//     COMPILE-ENTER-FRAME
//
//     1 COMPILE-ALIGN-RET-FOR-N
//     GETPBUFFERHANDLEARRAYHEAD N  RDI MOV,
//     ' dg_forthswap >BODY N  RAX  MOV,
//     RAX CALL,
//
//     COMPILE-EXIT-FRAME
//   END-CODE
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilepushpBHarrayheadtoret
//
// C prototype: 
//  void dg_compilepushpBHarrayheadtoret (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the
//                                 other bufferhandles are stored.
//
// Outputs:
//  none
//
// Action:
//  This function assumes you are using the DiaperGlu standard frame for the
//   subroutine you are compiling.
//  In 32 bit mode, parameters were passed on the return stack. But in 64 bit
//   mode, the first 6 parameters are passed in registers. This function assumes
//   you are calling a subroutine where pBHarrayhead will be the first parameter.
//   This means pBHarrayhead will be passed in RDI.
//   This function compiles code to copy pBHarrayhead from where it was saved
//    in the standard frame during COMPILE-ENTER-FRAME to the RDI register.
//   Yes, I have not updated the name of this function to reflect how it works
//    in the 64 bit version of DiaperGlu yet.
//
// Failure cases:
//  current compile buffer id is for a nonexistent invalid buffer
//  out of memory when trying to grow current compile buffer
// 
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilecallcore
//
// C prototype: 
//  void dg_compilecallcore (
//   Bufferhandle* pBHarrayhead,
//   UINT64 addr)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the
//                                 other bufferhandles are stored.
//
//  UINT64        addr            address of DiaperGlu script routine in core to call
//
// Outputs:
//  none
//
// Action:
//  Compiles a call to a DiaperGlu script routine. Since this function compiles
//   a call to an absolute address, the target DiaperGlu script routine must be
//   in a buffer that will not move, in a loaded shared library, or in the
//   loaded DiaperGlu executable.
//
// Failure cases:
//  current compile buffer id is for a nonexistent invalid buffer
//  out of memory when trying to grow current compile buffer
// 
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilecallftcolon
//
// C prototype: 
//  void dg_compilecallftcolon (
//   Bufferhandle* pBHarrayhead,
//   UINT64 n)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the
//                                 other bufferhandles are stored.
//
//  UINT64        n               index of DiaperGlu script routine in DiaperGlu
//                                 function table to call
//
// Outputs:
//  none
//
// Action:
//  Compiles a call to a DiaperGlu script routine at an index in the
//   DiaperGlu function table. This lets you call certain DiaperGlu functions by
//   number. Right now there are only 2 functions in the function table that
//   the pointer points to at startup: (v4.1)
//   0 = dg_pushdatastack
//   1 = dg_forthdup
//  Looks like I didn't get far with this. I think glu/nglu linker replaced what
//   this function was going to do. The idea was you could use links for DiaperGlu
//   script functions, but compiling pushing numbers to the data stack
//   and compiling words like $" would be easier to do through a function table
//   than a link.
//
// Failure cases:
//  current compile buffer id is for a nonexistent invalid buffer
//  out of memory when trying to grow current compile buffer
// 
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilepushdatastack
//
// C prototype: 
//  void dg_compilepushdatastack (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the
//                                 other bufferhandles are stored.
//
// Outputs:
//  none
//
// Action:
//  Compiles a call to dg_pushdatastack(pBHarrayhead, n)
//  The compiled code expects to be executed in a subroutine that is using the
//   DiaperGlu standard frame.
//  The compiled code also expects n to be on top of the return stack
//
// Failure cases:
//  current compile buffer id is for a nonexistent invalid buffer
//  out of memory when trying to grow current compile buffer
// 
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilepushntodatastack
//
// C prototype: 
//  void dg_compilepushntodatastack (
//   Bufferhandle* pBHarrayhead,
//   UINT64 n)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the
//                                 other bufferhandles are stored.
//
//  UINT64        n               value to push to the data stack.
//
// Outputs:
//  none
//
// Action:
//  Compiles a call to dg_pushdatastack(pBHarrayhead, n)
//  The compiled code expects to be executed in a subroutine that is using the
//   DiaperGlu standard frame.
//
// Failure cases:
//  current compile buffer id is for a nonexistent invalid buffer
//  out of memory when trying to grow current compile buffer
// 
// /////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilobtoptodatastack
//
// C prototype: 
//  dg_compilobtoptodatastack (
//   Bufferhandle* pBHarrayhead,
//   UINT64 bufferid,
//   UINT64 offset)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the
//                                 other bufferhandles are stored.
//
//  UINT64        bufferid        index of the bufferhandle in the BHarray
//
//  UINT64        offset          0 based offset in bytes from start of buffer
//
// Outputs:
//  none
//
// Action:
//  Compiles code to push the address of an offset in a buffer to the datastack.
//  The code calculates the address when the compiled code is executed using
//   the offset and bufferid.
//
// Failure cases:
//  current compile buffer id is for a nonexistent invalid buffer
//  out of memory when trying to grow current compile buffer
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compileinitlocals ( COMPILE-ENTER-FRAME ENTER-DGLU-FRAME, )
//
// C prototype:
//  void dg_compileinitlocals (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//                                                          
// Action:
//  Compiles code to set up the standard Diaperglu frame on the return stack.
//
// Notes:
//  Stack frame on Mac is:
//   param n
//   ...
//   param 7               +0x18
//   param 6               +0x10
//   ret                   +0x08
//   saved rbp             +0x00
//   saved flags           -0x08
//   pBHarrayhead          -0x10
//   old error count       -0x18
//   local storage marker  -0x20  (change this to mark where subroutine's
//                                  return stack stuff is)
//
//  Stack frame on Windows is:
//   param n
//   ...
//   param 5               +0x38
//   param 4               +0x30
//   shadow 3              +0x28
//   shadow 2              +0x20
//   shadow 1              +0x18
//   shadow 0              +0x10
//   ret                   +0x08
//   saved rbp             +0x00
//   saved flags           -0x08
//   pBHarrayhead          -0x10
//   old error count       -0x18
//   local storage marker  -0x20  (change this to mark where subroutine's
//                                  return stack stuff is)
//
//  How this frame works:
//   64 bit operating systems require
//    32 byte return stack alignment at the point of the subroutine call.
//    This frame allows quick and easy alignment of the return stack by use of
//    the local storage marker. Instead of having to restore the return stack
//    after a subroutine call, the alignment code can just drop everything back
//    to the local storage marker and adjust for alignment from there.
//    Since the alignment code cleans up the return stack, it doesn't matter if
//    the called subroutine cleans up the return stack or not, the call is
//    handled the same way.
//    It does mean there is a little extra work to setting up local storage on
//    the return stack. With this frame instead of just pushing things to the
//    return stack, you also need to adjust the local storage marker.
//
// Example of using the Diaperglu standard frame:
//  HEX
//  CODE mysub
//   COMPILE-ENTER-FRAME
//
//   28 N  RSP SUB,              // might need to do this first because  an
//                               //  interrupt could overwrite your data...
//                               //  but isn't there supposed to be a guard zone?
//                               //  wouldn't hurt to do this...
//
//   28 N  RBP -20 [R+N]  SUB,   // allocate 0x28 bytes on the local frame
//
//   1122334455667788 N  RAX  MOV,
//   RAX  RBP -48 [R+N] -> MOV,  // store a value in last of the local variables
//
//   1 COMPILE-ALIGN-RET-FOR-N
//   RBP -48 [R+N]  RDI -> MOV,      // use local variable value as first
//                                   //  parameter
//   ' myfirstlibsub >BODY N  RAX  MOV,  // assumes fixed address subroutine
//   RAX CALL,
//
//   // no need to clean up return stack
//
//   1 COMPILE-ALIGN-RET-FOR-N
//   RBP -48 [R+N]  RDI -> MOV,      // use local variable value as first
//                                   //  parameter again
//   ' mysecondlibsub >BODY N  RAX  MOV,  // assumes fixed address
//   RAX CALL,
//
//   // no need to clean up return stack
//
//   // calling a DiaperGlu Script function:
//   1 COMPILE-ALIGN-RET-FOR-N
//   RBP -10 [R+N]  RDI -> MOV,  // move pBHarrayhead to RDI
//   ' SWAP >BODY N RAX MOV,
//   RAX CALL,
//
//   // no need to clean up return stack
//
//
//   // calling a subroutine in the same buffer that was compiled using CODE
//   //  that uses a parameter from the frame
//   RBP -48 [R+N]  RDI  ->  MOV,
//   myCODEsubroutinename
//
//   // calling a subroutine in the same buffer that was compiled using OCODE
//   //  that uses a parameter from the frame
//   RBP -48 [R+N]  RDI  ->  MOV,
//   myOCODEsubroutinename DROP O CALL,
//
//   COMPILE-EXIT-FRAME
//  END-CODE
//
// Failure cases:
//  current compile buffer id is for a nonexistent invalid buffer
//  out of memory when trying to grow current compile buffer
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compileexitlocals   
//  ( COMPILE-EXIT-FRAME EXIT-DGLU-FRAME, EXIT-CALL-SUBS-FRAME, )
//
// C prototype:
//  void dg_compileexitlocals(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead        pointer to a Bufferhandle structure which is 
//                                     used as the bufferhandle for the array where  
//                                     the other bufferhandles are stored.
//
// Action:
//  Compiles code to exit a subroutine's frame and return
//   Exiting a subroutine and its frame involves
//   1) popping the saved flags register from the return stack
//   2) restoring the frame pointer register to the parent subroutine's frame
//   3) returning from the subroutine
// 
// Failure cases:
//  current compile buffer id is for a nonexistent invalid buffer
//  out of memory when trying to grow current compile buffer
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthexitcallsubsframenoretcomma   ( EXIT-CALL-SUBS-FRAME-NO-RET, )
//
// C prototype:
//  void dg_forthexitcallsubsframenoretcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead        pointer to a Bufferhandle structure which is 
//                                     used as the bufferhandle for the array where  
//                                     the other bufferhandles are stored.
//
// Action:
//  Compiles code to exit a subroutine's frame but not return
//   Exiting a subroutine and its frame involves
//   1) popping the saved flags register from the return stack
//   2) restoring the frame pointer register to the parent subroutine's frame
// 
// Failure cases:
//  current compile buffer id is for a nonexistent invalid buffer
//  out of memory when trying to grow current compile buffer
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthentercallsubsframecomma ( ENTER-CALL-SUBS-FRAME, )
//
// C prototype:
//  void dg_forthentercallsubsframecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//                                                          
// Action:
//  Compiles code to set up a frame on the return stack that supports the flat
//   .o buf helper words that help do local variables and compile calling
//   subroutines. 
//
// Notes:
//  Stack frame on Mac is:
//   param n
//   ...
//   param 7               +0x18
//   param 6               +0x10
//   ret                   +0x08
//   saved rbp             +0x00
//   saved flags           -0x08
//   copy of param 0       -0x10
//   copy of param 1       -0x18
//   local storage marker  -0x20  (change this to mark where subroutine's
//                                  return stack stuff is)
//
//  Stack frame on Windows is:
//   param n
//   ...
//   param 5               +0x38
//   param 4               +0x30
//   shadow 3              +0x28
//   shadow 2              +0x20
//   shadow 1              +0x18
//   shadow 0              +0x10
//   ret                   +0x08
//   saved rbp             +0x00
//   saved flags           -0x08
//   copy of param 0       -0x10
//   copy of param 1       -0x18
//   local storage marker  -0x20  (change this to mark where subroutine's
//                                  return stack stuff is)
//
//  How this frame works:
//   64 bit operating systems require
//    32 byte return stack alignment at the point of the subroutine call.
//    This frame allows quick and easy alignment of the return stack by use of
//    the local storage marker. Instead of having to restore the return stack
//    after a subroutine call, the alignment code can just drop everything back
//    to the local storage marker and adjust for alignment from there.
//    Since the alignment code cleans up the return stack, it doesn't matter if
//    the called subroutine cleans up the return stack or not, the call is
//    handled the same way.
//    It does mean there is a little extra work to setting up local storage on
//    the return stack. With this frame instead of just pushing things to the
//    return stack, you also need to adjust the local storage marker.
//
// Failure cases:
//  current compile buffer id is for a nonexistent invalid buffer
//  out of memory when trying to grow current compile buffer
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilepusholderrorcounttoret ( COMPILE-OLDERRORCOUNT>RET )
//
// C prototype:
//  void dg_compilepusholderrorcounttoret (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//                                                          
// Action:
//  Compiles code to push the old error count to the return stack.
//  This is the error count at the time the current subroutine was entered.
//  This function assumes the current subroutine is using the DiaperGlu standard
//   frame.
// 
// Failure cases:
//  current compile buffer id is for a nonexistent invalid buffer
//  out of memory when trying to grow current compile buffer
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilequeryerror  ( COMPILE-QUERYERROR )
//
// C prototype:
//  void dg_compilequeryerror (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//                                                          
// Action:
//  Compiles code to compare the current error count with the old error count saved
//   from when the subroutine was entered, then compiles an unresolved branch that
//   is taken if the two error counts were equal.
//
// Note:
//  The unresolved branch is usually resolved with a THEN. In the script this 
//  looks like:
//   ?ERRORIF dostuff THEN
// 
// Failure cases:
//  current compile buffer id is for a nonexistent invalid buffer
//  out of memory when trying to grow current compile buffer
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcompilealignretfornpf ( COMPILE-ALIGN-RET-FOR-N-PRESERVE-FLAGS )
//
// C prototype:
//  void dg_forthcompilealignretfornpf (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where 
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( n -- )
//
// Data Stack In:
//  n                             number of 64 bit units in the parameter list of
//                                 the next subroutine call
//                                 Most parameter types like bool, char, UINT16
//                                 are padded to 64 bits. Those that are more
//                                 than 64 bits like UINT128 are
//                                 usually 128 bits and are worth 2 64 bit units.
//
// Action: 
//  Compiles code to align the return stack for a subroutine call.
//
//  Note:
//   Compiled code does not alter any registers except the flags and the
//    return stack pointer.
//   Compiled code requires the Diaperglu standard frame to work. This means
//    you have to call COMPILE-ENTER-FRAME at the start of a CODE routine and
//    COMPILE-EXIT-FRAME at each exit to compile the code to enter and exit
//    the standard frame.
//   If Diaperglu is running on a 64 bit x86 processor and calls operating
//    system functions, the process may crash if the return stack is not
//    aligned. Some 64 bit operating systems require 32 byte alignment of the
//    return stack at the point of a subroutine call.
//   You may also run into this problem if you are running under an OS
//    emulator which is running on a 64 bit processor.
//   Using this alignment code fixes the problem.
//   In 64 bit mode, the first 6 parameters are passed in registers,
//    after that, parameters are passed on the return stack.
//
// Example:
//   $" libdiaperglu.dylib" LOADLIBRARY$
//
//   CODE callforthfromcode
//     COMPILE-ENTER-FRAME
//
//     1 COMPILE-ALIGN-RET-FOR-N
//     GETPBUFFERHANDLEARRAYHEAD N  RDI MOV,
//     ' dg_forthswap >BODY N  RAX  MOV,
//     RAX CALL,
//
//     COMPILE-EXIT-FRAME
//   END-CODE
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_showframe   
//
// C prototype:
//  void dg_showframe (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead        pointer to a Bufferhandle structure which is 
//                                     used as the bufferhandle for the array where 
//                                     the other bufferhandles are stored.
//                                                          
// Action:
//  sends a message to stdout showing the saved registers and values in the return
//  stack frame of the subroutine that called this function
// 
// Failure cases:
//  none
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilepushregtoret   
//
// C prototype:
//  void dg_compilepushregtoret (
//   Bufferhandle* pBHarrayhead,
//   UINT64 reg)
//
// Inputs:
//  Bufferhandle* pBHarrayhead      pointer to a Bufferhandle structure which is
//                                   used as the bufferhandle for the array where
//                                   the other bufferhandles are stored.
//  UINT64        reg                 
//                                                          
// Action:
//  Compiles code to push a non REX 64 bit register to the return stack.
//
// Note:
//  If reg < dg_rax from the dg_cpux86regs enum in diaperglu.h
//   then the REX byte for a 64 bit data width is used otherwise
//   the REX byte is not used. Note that REX is not needed to specify
//   a 64 bit data size for the x86 push instruction in 64 bit mode.
//  In any case the reg value passed in is anded with 7 to determine
//   which register is pushed.
//
//  reg&7  register
//  0          RAX
//  1          RCX
//  2          RDX
//  3          RBX
//  4          RSP
//  5          RBP
//  6          RSI
//  7          RDI
// 
// Failure cases:
//  current compile buffer id is for a nonexistent invalid buffer
//  out of memory when trying to grow current compile buffer
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilepopregfromret   
//
// C prototype:
//  void dg_compilepopregfromret (
//   Bufferhandle* pBHarrayhead,
//   UINT64 reg)
//
// Inputs:
//  Bufferhandle* pBHarrayhead        pointer to a Bufferhandle structure which is
//                                     used as the bufferhandle for the array  
//                                     where the other bufferhandles are stored.
//  UINT64        reg                 
//                                                          
// Action:
//  Compiles code to pop a non REX 64 bit register from the return stack.
//
// Note:
//  If reg < dg_rax from the dg_cpux86regs enum in diaperglu.h
//   then the REX byte for a 64 bit data width is used otherwise
//   the REX byte is not used. Note that REX is not needed to specify
//   a 64 bit data size for the x86 pop instruction in 64 bit mode.
//  In any case the reg value passed in is anded with 7 to determine
//   which register is popped.
//
//  reg&7  register
//  0          RAX
//  1          RCX
//  2          RDX
//  3          RBX
//  4          RSP
//  5          RBP
//  6          RSI
//  7          RDI
// 
// Failure cases:
//  current compile buffer id is for a nonexistent invalid buffer
//  out of memory when trying to grow current compile buffer
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilebranch   
//
// C prototype:
//  UINT64 dg_compilebranch (
//   Bufferhandle* pBHarrayhead,
//   UINT64 branchtype)
//
// Inputs:
//  Bufferhandle* pBHarrayhead        pointer to a Bufferhandle structure which is 
//                                     used as the bufferhandle for the array  
//                                     where the other bufferhandles are stored.
//  UINT64        branchtype          branch type code
//
// Outputs:
//  UINT64        return              the after branch offset in bytes of the 
//                                     compiled branch in the current compile
//                                     buffer
//                                    or the 64 bit value of -1 if the branch
//                                     type was never
//                                                          
// Action:
//  Compiles an unresolved branch on a condition for all branch types except
//   never.
//  If the branch type is never, nothing is compiled.
//  Unknown branch types are treated as branch type never.
//  
//
// Notes:
//  branch types, branch on:
//      0         overflow
//      1         no overflow
//      2         u< or carry set
//      3         u>= or carry clear
//      4         = or zero
//      5         != or not zero
//      6         u<=
//      7         u>
//      8         minus
//      9         plus
//   0x0a         parity even  ( dont think all platforms will support this )
//   0x0b         parity odd
//   0x0c         signed <
//   0x0d         signed >=
//   0x0e         signed <=
//   0x0f         signed >
// 
//   0x10         always
//   0x11         never (compile nothing)
//
//  Anything else is treated as branch never and nothing is compiled.
// 
// Failure cases:
//  current compile buffer id is for a nonexistent invalid buffer
//  out of memory when trying to grow current compile buffer
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_resolvecompiledbranch
//
// C prototype:
//  void dg_resolvecompiledbranch(
//   Bufferhandle* pBHarrayhead, 
//   UINT64 afterbranchoffset,
//   UINT64 targetoffset)
//
// Inputs:
//  Bufferhandle* pBHarrayhead        pointer to a Bufferhandle structure which is 
//                                     used as the bufferhandle for the array where 
//                                     the other bufferhandles are stored.
//  UINT64        afterbranchoffset   offset in bytes from the beginning of the
//                                     current compile buffer immediately after
//                                     the compiled branch instruction
//  UINT64        targetoffset        branch target offset in bytes from the 
//                                     beginning of the current compile buffer  
//                                                          
// Action:
//  Calculates the offset for the previously compiled relative branch and updates
//   the compiled branch's instruction code.
// 
// Failure cases:
//  current compile buffer id is for a nonexistent invalid buffer
//  out of memory when trying to grow current compile buffer
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_resolvecompiled8bitbranch
//
// C prototype:
//  void dg_resolvecompiled8bitbranch(
//   Bufferhandle* pBHarrayhead, 
//   UINT64 afterbranchoffset,
//   UINT64 targetoffset)
//
// Inputs:
//  Bufferhandle* pBHarrayhead        pointer to a Bufferhandle structure which is 
//                                     used as the bufferhandle for the array where 
//                                     the other bufferhandles are stored.
//  UINT64        afterbranchoffset   offset in bytes from the beginning of the
//                                     current compile buffer immediately after
//                                     the compiled branch instruction
//  UINT64        targetoffset        branch target offset in bytes from the 
//                                     beginning of the current compile buffer  
//                                                          
// Action:
//  Calculates the offset for the previously compiled relative branch and updates
//   the compiled branch's instruction code.
// 
// Failure cases:
//  current compile buffer id is for a nonexistent invalid buffer
//  out of memory when trying to grow current compile buffer
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilecompare   
//
// C prototype:
//  void dg_compilecompare(
//   Bufferhandle* pBHarrayhead, 
//   UINT64 n)
//
// Inputs:
//  Bufferhandle* pBHarrayhead        pointer to a Bufferhandle structure which is
//                                     used as the bufferhandle for the array  
//                                     where the other bufferhandles are stored.
//  INT64         n                   signed 32 bit integer constant value to compare
//                                                          
// Action:
//  Compiles code to compare the designated compare register with a constant value.
//
// Note:
//  On the x86, the compare register is rax because rax is used as the return
//   value from cdecl subroutines.
//  Also, the value n must fit into a signed 32 bit integer because that's the
//   largest size available for the x86 compare immediate instruction.
// 
// Failure cases:
//  current compile buffer id is for a nonexistent invalid buffer
//  out of memory when trying to grow current compile buffer
//  n does not fit into a signed 32 bit integer
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilepushntoret   
//
// C prototype:
//  UINT64 dg_compilepushntoret (
//   Bufferhandle* pBHarrayhead,
//   UINT64 n)
//
// Inputs:
//  Bufferhandle* pBHarrayhead        pointer to a Bufferhandle structure which is 
//                                     used as the bufferhandle for the array  
//                                     where the other bufferhandles are stored.
//  UINT64        n                   constant value
//
// Output:
//  UINT64        return              the buffer offset in bytes of the compiled 
//                                     UINT64 n data in the current compile buffer
//                                                          
// Action:
//  Compiles code to push a constant UINT64 value to the return stack
//
// Note:
//  For values of n that do not fit into a signed 32 bit integer, two instructions
//   are compiled: a push, and a move. The move does the high part of n.
// 
// Failure cases:
//  current compile buffer id is for a nonexistent invalid buffer
//  out of memory when trying to grow current compile buffer
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilentoparameter
//
// C prototype:
//  UINT64 dg_compilentoparameter (
//   Bufferhandle* pBHarrayhead,
//   UINT64 n,
//   UINT64 parameter)
//
// Inputs:
//  Bufferhandle* pBHarrayhead        pointer to a Bufferhandle structure which is 
//                                     used as the bufferhandle for the array  
//                                     where the other bufferhandles are stored.
//  UINT64        n                   UINT64 constant value
//  UINT64        parameter           0 based where 0 means the first parameter
//                                                          
// Action:
//  If parameter is less than 6,
//   this compiles code to move n to a parameter register.
//    parameter   register
//     0           RDI
//     1           RSI
//     2           RDX
//     3           RCX
//     4           R8
//     5           R9
//
//  If parameter is greater or equal to 6,
//   the parameter value is ignored and
//   this function compiles code to push n to the return stack.
//   This means you have to compile code to push the parameters
//   greater than or equal to 6 in reverse order and call this function
//   at the right time.
// 
// Failure cases:
//  current compile buffer id is for a nonexistent invalid buffer
//  out of memory when trying to grow current compile buffer
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcompilesafecallbuffer
//
// C function prototype:
//  void dg_forthcompilesafecallbuffer (Bufferhandle* pBHarrayhead) 
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where 
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( bufferoffset bufferid -- )
//   
// Data stack in:
//  bufferid                      buffer the target routine is in
//  bufferoffset                  offset of the target routine in the buffer
//                                                            
// Action:
//  Compiles a safe call to a subroutine at an offset in a target buffer 
//   onto the end of the current compile buffer. 
//  If the buffer the call was made from moves,
//   this call will return to the correct place.
//
//  Notes:
//   Compiles:
//    push offset in buffer of return
//    push id of buffer of return
//    push pBHarrayhead
//     (return stack needs to be 16 byte aligned here
//      in order to meet mac os x requirements)
//    push address of jump to offset in buffer code
//     (this is the address for the subroutine return)
//     (address calculated when this code compiled)
//
//    (the above is the return stack parameters and
//      return address passed to the called routine)
//
//    push offset in buffer of target routine
//    push id of buffer in target routine
//    push pBHarrayhead
//     (return stack needs to be 16 byte aligned here
//      for the call to dg_getpbufferoffset)
//    jmp to address of jump to offset in buffer code 
//     (address calculated when this code compiled)
//
//  Because address of the jump to buffer routine is
//   calculated when this code is compiled,
//   the address of the jump to buffer code can not change.
//   This won't happen unless you want
//   to save the compiled code into a file and reload it later.
//
// What happens during a safe call:
//  safe call occurs from a standard dglu subroutine frame
//  the return stack is aligned for a 3 parameter call
//  then 3 parameters are pushed to the return stack for a call
//   to dg_getpbufferoffset which will calculate the true return address
//  then the address of the jump to buffer code is pushed to
//   the return stack, when the called routine returns, the return will
//   go to the jump to buffer code which will calculate the
//   true return address and jump to it
//  then the control flow jumps to the target routine at its offset in
//   its buffer
//  
//  return stack looks like this:
//    align space
//    return offset
//    return bufferid
//    pBHarrayhead
//     - 16 byte aligned here
//    addr of jumptobuffer code
//    target offset
//    target bufferid
//    pBHarrayhead
//     - 16 byte aligned here (nice how that worked out)
//
// Possibility:
//  Another way to do this is to compile code
//   that calculates the address of the jump to buffer
//   routine when the compiled code runs.
//  Then the compiled safe call could be used in a shared
//   library without run time linking or binding.
//  Of course this still assumes the buffer ids and
//   offsets of the called routines do not change.
// 
// Failure cases:
//  error popping the bufferoffset and bufferid from the data stack
//  error getting the current compile buffer id
//  error growing the current compile buffer
//  error getting the pointer to the current compile buffer
//  offset is off the end of the buffer
//  subroutine is too far away to call,
//   > 0x80000000 away in the current compile buffer
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_initjumpbuffer   
//
// C prototype:
//  void dg_initjumpbuffer (
//   Bufferhandle* pBHarrayhead, 
//   char* jumpbufferstring)
//
// Inputs:
//  Bufferhandle* pBHarrayhead        pointer to a Bufferhandle structure which is 
//                                     used as the bufferhandle for the array 
//                                     where  the other bufferhandles are stored.
//  char*         jumpbufferstring    pointer to memory where jumpbuffer code
//                                     will go
//
// Output:
//  none
//                                                          
// Action:
//  Compiles jumpbuffer code to the memory at jumpbufferstring
//
//  To use the jumpbuffer code you push offset, bufferid,
//   and pBHarrayhead to the return stack
//  Then you jump to the jumpbufferstring address,
//   or return to the jumpbufferstring address
//  The jumpbuffer code calculates a target address
//   using the offset and bufferid passed to it
//   and jumps to the target address
//
// Note:
//  This subroutine assumes the memory at jumpbufferstring is valid.
// 
// Failure cases:
//  none
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_initSibformatter
//
// C prototype:
//  void dg_initSibformatter (struct dg_Sibformatter* psf)
//
// Inputs:
//  struct dg_Sibformatter* psf       pointer to a dg_Sibformatter structure which
//                                     is used to hold the variables needed to
//                                     format the SIB portion of an x86 opcode.
//
// Output:
//  none
//                                                        
// Action:
//  Initializes a dg_Sibformatter structure with the default values.
// 
// Failure cases:
//  data stack underflow
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_bumpdisplacementsizeifneeded   
//
// C prototype:
//  void dg_bumpdisplacementsizeifneeded (dg_Sibformatter* psf)
//
// Inputs:
//  dg_Sibformatter* psf              pointer to a dg_Sibformatter structure
//                                     which holds a description of an assembler
//                                     instruction target
//
// Output:
//  none
//
// Action:
//  Displacements are either 0, 1, or 4 bytes.
//   This routine upgrades the displacement size
//   to a higher value if it currently can't hold the displacement value. 
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_setmod   
//
// C prototype:
//  void dg_setmod (dg_Sibformatter* psf)
//
// Inputs:
//  dg_Sibformatter* psf              pointer to a dg_Sibformatter structure which 
//                                     holds a description of an assembler
//                                     instruction target
//
// Output:
//  none
//
// Action:
//  Upgrades the displacement size for the [ebp+0] addressing mode case if needed,
//  then calculates the mod value of the modr/m byte based on the
//   displacement size.
// 
// Failure cases:
//  
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_formatsib
//
// C prototype: 
//  void dg_formatsib (
//    Bufferhandle* pBHarrayhead,
//    dg_Sibformatter* psf)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array  
//                                 where the other bufferhandles are stored.
//
//  dg_Sibformatter* psf          pointer to a dg_Sibformatter structure which 
//                                 holds a description of an assembler
//                                 instruction target.
//
// Outputs:
//  none
//                              
// Action:
//   Sets the psf structure up for the sib addressing mode and calculates
//    the rslashm, mod, basereg, and indexreg.
//    Also upgrades the displacement size if needed.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_formatmodrslashm
//
// C prototype: 
//  void dg_formatmodrslashm (
//    Bufferhandle* pBHarrayhead,
//    dg_Sibformatter* psf)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where 
//                                 the other bufferhandles are stored.
//
//  dg_Sibformatter* psf          pointer to a dg_Sibformatter structure which 
//                                 holds a description of an assembler 
//                                 instruction target.
//
// Outputs:
//  none
//                              
// Action:
//   Sets the psf structure up for the modr/m mode and calculates the rslashm, 
//    and mod if needed. If the modr/m mode can't support the addressing mode, 
//    then dg_formatsib is called to set up the SIB addressing mode.
//
// Note:
//  [n] is also promoted to SIB.
//
// Note:
//   In 32 bit x86 mode, modr/m can do [n], however, this is currently promoted 
//     to SIB.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_formatreg
//
// C prototype: 
//  void dg_formatreg(
//    Bufferhandle* pBHarrayhead,
//    dg_Sibformatter* psf)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where 
//                                 the other  bufferhandles are stored.
//
//  dg_Sibformatter* psf          pointer to a dg_Sibformatter structure which 
//                                 holds a description of an assembler
//                                 instruction target.
//
// Outputs:
//  none
//                              
// Action:
//   Sets the psf structure up for a reg target using modr/m addressing in 
//   register mode. mod is set to 3. rslashm is set to basereg.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_calculatemodrslashm
//
// C prototype: 
//  unsigned char dg_calculatemodrslashm (dg_Sibformatter* psf)
//
// Inputs:
//  dg_Sibformatter* psf          pointer to a dg_Sibformatter structure which 
//                                 holds a description of an assembler instruction
//                                 target.
//
// Outputs:
//  return  unsigned char         modr/m byte
//                              
// Action:
//   Calculates the modr/m byte from the values in the psf structure.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_calculatesib
//
// C prototype: 
//  unsigned char dg_calculatesib (dg_Sibformatter* psf)
//
// Inputs:
//  dg_Sibformatter* psf          pointer to a dg_Sibformatter structure which 
//                                 holds a description of an assembler
//                                 instruction target.
//
// Outputs:
//  return  unsigned char         sib byte
//                              
// Action:
//   Calculates the sib byte from the values in the psf structure.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_getsizefromreg
//
// C prototype: 
//  UINT64 dg_getsizefromreg (UINT64 reg)
//
// Inputs:
//  UINT64  reg                   Diaperglu id code for an x86 register
//
// Outputs:
//  return  UINT64                the size of the register in bytes,
//                                 either 1, 2, or 4
//                              
// Action:
//   Calculates the size in bytes of an x86 register.
//
// Note:
//   The value of reg is from the dg_cpux86regs enum in diapergluforth.h
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_checkbasereg
//
// C prototype: 
//  const char* dg_checkbasereg (UINT64 reg)
//
// Inputs:
//  UINT64  reg                   Diaperglu id code for an x86 register
//
// Outputs:
//  return  const char*           dg_success if reg is a valid base register
//                                an error message if reg is not a valid base
//                                 register
//                              
// Action:
//   Checks the register passed in to determine if it is a valid base register
//
// Note:
//  in 32 bit mode, base reg must be one of EAX, EBX, ECX, EDX, EBP, ESP, EDI, ESI
//  in 64 bit mode, base reg must be one of RAX, RBX, RCX, RDX, RBP, RSP, RDI, RSI,
//      R8, R9, R10, R11, R12, R13, R14, or R15
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_checkindexreg
//
// C prototype: 
//  const char* dg_checkindexreg (UINT64 reg)
//
// Inputs:
//  UINT64  reg                   Diaperglu id code for an x86 register
//
// Outputs:
//  return  const char*           dg_success if reg is a valid index register
//                                an error message if reg is not a valid index
//                                 register
//                              
// Action:
//   Checks the register passed in to determine if it is a valid index register
//
// Note:
//  in 32 bit mode, index reg must be one of EAX, EBX, ECX, EDX, EBP, EDI, ESI
//  in 64 bit mode, index reg must be one of RAX, RBX, RCX, RDX, RBP, RDI, RSI,
//      R8, R9, R10, R11, R12, R13, R14, or R15
//  It looks like this routine will not return an error if you specify ESP/RSP
//   as an index reg at this time
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_pullmemusingsib
//
// C prototype: 
//  void dg_pullmemusingsib (
//    Bufferhandle* pBHarrayhead,
//    dg_Sibformatter* psf)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where 
//                                 the other bufferhandles are stored.
//
//  dg_Sibformatter* psf          pointer to a dg_Sibformatter structure which 
//                                 holds a description of an assembler
//                                 instruction target.
//
//
// Stack action shorthand:
//   ( basereg scale indexreg displacement mindisplacementsize -- )
//
// Data stack in:
//   basereg                      dg_cpux86regs enum code for a register
//                                 in 32 bit mode this must be a 32 bit register
//                                 in 64 bit mode this must be a 64 bit register
//   scale                        0, 1, 2, or 3 which encodes for
//                                 *1, *2, *4, or *8
//   indexreg                     dg_cpux86regs enum code for a register
//                                 in 32 bit mode this must be a 32 bit register
//                                 in 64 bit mode this must be a 64 bit register
//   displacement                 signed displacement in bytes
//   mindisplacementsize          minimum size in bytes of the displacement
//                                 can be 0, 1, or 4.
//                                 specify 0 to use the smallest possible size
//
// Outputs:
//  none
//                              
// Action:
//   Pops values for a sib target and uses them to fill out the psf
//    structure.
//
// Note:
//   Sib memory addressing supports:
//    [basereg + scale*indexreg + displacement]
//
//   indexreg can not be ESP
//   basereg and/or indexreg can be NOREG
//   displacement can be 0
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_pullmemusingrslashm
//
// C prototype: 
//  void dg_pullmemusingrslashm (
//    Bufferhandle* pBHarrayhead,
//    dg_Sibformatter* psf)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where  
//                                 the other bufferhandles are stored.
//
//  dg_Sibformatter* psf          pointer to a dg_Sibformatter structure which 
//                                 holds a description of an assembler
//                                 instruction target.
//
//
// Stack action shorthand:
//   ( basereg displacement mindisplacementsize -- )
//
// Data stack in:
//   basereg                      dg_cpux86regs enum code for a register
//                                 in 32 bit mode this must be a 32 bit register
//                                 in 64 bit mode this must be a 64 bit register
//   displacement                 signed displacement in bytes
//   mindisplacementsize          minimum size in bytes of the displacement
//                                 can be 0, 1, or 4.
//                                 specify 0 to use the smallest possible size
//
// Outputs:
//  none
//                              
// Action:
//   Pops values for a modr/m memory target and uses them to fill out 
//    the psf structure.
//
// Note:
//   Modr/m memory addressing supports [basereg + displacement]
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_getcallsubsframepreservedregoffset
//
// C prototype: 
//  INT64 dg_getcallsubsframepreservedregoffset(
//    Bufferhandle* pBHarrayhead,
//    UINT64 regpreservedpos)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where  
//                                 the other bufferhandles are stored.
//
//  UINT64 regpreservedpos        0 based which preserved reg this is
//
// Outputs:
//  INT64 return                  rbp relative call subs frame offset for the
//                                 preserved reg
//                              
// Action:
//   Calculates the rbp relative call subs frame offset for the preserved reg.
//     regpreservedpos 0 is the first preserved reg and corresponds to the lowest
//     bit set in the value in PPRESERVED-RMASK. regpreservedpos 1 is the
//     second preserved reg. This function uses the value in PPRESERVED-DEPTH
//     to do the calculation.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_pulloneaddressingmode
//
// C prototype: 
//  void dg_pulloneaddressingmode(
//    Bufferhandle* pBHarrayhead,
//    dg_Sibformatter* psf)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where  
//                                 the other bufferhandles are stored.
//
//  dg_Sibformatter* psf          pointer to a dg_Sibformatter structure which 
//                                 holds a description of an assembler
//                                 instruction target.
//
//
// Stack action shorthand:
//   ( ... parameterlist -- )
//
// Data stack in:
//   Parameter list for one target.
//
// Outputs:
//  none
//                              
// Action:
//   Pops the parameter list for one target and uses the parameters to fill out
//    the psf structure.
//   Instruction data size and direction commands are popped until a target
//    is popped. Once a target is popped, this command exits.
//
// Notes:
//   Parameter list supports these types:
//     dg_isdatasize  ( DATASIZE )
//      sets the data size of the instruction in bytes
//
//     dg_isforward  ( -> )
//      does nothing because direction is forward by default
//      on one target instructions, direction is not used
//
//     dg_isreverse ( <- )
//      sets the direction of the instruction to reverse
//      on one target instructions, direction is not used
//      on some two target instruction modes, direction is not used because
//      either the instruction doesn't have a destination, or the destination
//      is assumed to be one of the targets. ( e.g. those with immediate targets )
//      if reverse is used, specifying reverse on either target makes the
//      whole instruction reverse, overriding any forward declarations
//
//     dg_isimmediate ( IMMEDIATE )
//      pops values for an immediate target
//
//     dg_isbasedisplacement ( [MODM] )
//      pops values for a modr/m memory target
//
//     dg_isbasescaleindexdisplacement ( [SIB] )
//      pops values for a sib memory target
//
//     dg_isbasescalevindexdisplacement ( [VSIB] )
//      pops values for a vsib memory target (looks like index reg is ymmr)
//
//     dg_ishereplusdisplacement ( EIP+N RIP+N )
//      pops values for a pc relative instruction
//      (this is currently only supported for CALL,)
//
//     dg_isbufferoffset ( BUFFEROFFSET )
//      pops values for a target at an offset in a buffer
//      but since the buffer id is ignored at this time, this mode acts the
//      same as dg_iscurrentcompilebufferoffset
//
//     dg_iscurrentcompilebufferoffset ( CURRENTCOMPILEBUFFEROFFSET [O] )
//      pops values for an instruction that targets memory at an offset
//       in the current compile buffer
//
//     dg_isccbufferoffsetnobracket ( O )
//      pops values for an instruction that targets an offset in the current
//       compile buffer
//      (this is currently only supported for CALL,)
//
//     dg_isreg ( R )
//      pops id of an x86 reg target
//
//     dg_isfloatingpointstackreg ( FPSR )
//      pops id of an x86 floating point stack reg
//
//     dg_isxmmreg ( XMMR )
//      pops id of an x86 xmm reg
//
//     dg_isymmreg ( YMMR )
//      pops id of an x86 ymm reg
//
//     dg_iscontrolreg ( CR )
//      pops id of an x86 control reg
//
//     dg_issegmentreg ( SR )
//      pops id of an x86 segment reg
//
//     dg_isdebugreg ( DR )
//      pops id of an x86 debug reg
//
//     dg_isthreebytevex ( 3BYTEVEX )
//      sets the 'is using 3 byte vex' flag
//
//     dg_isdgluforthframelocal ( DGLU-FORTH-FRAME-LOCAL )
//      pops value for a [RBP+N] instruction where N = -value*sizeof(UINT64)
//
//     dg_isparamusingnoframe ( NO-FRAME-PARAM )
//      pops value for an instruction referencing a 0 based parameter in a 
//      subroutine not using rbp as the frame pointer
//      0 is the first parameter, 1 is the second parameter, etc.
//      if the parameter is for a reg that was preserved, this references
//       the preserved parameter on the return stack
//
//     dg_isparamusingframe ( FRAME-PARAM )
//      pops value for an instruction referencing a 0 based parameter in a 
//      subroutine using rbp as the frame pointer in a call subs or Diaperglu frame
//      0 is the first parameter, 1 is the second parameter, etc.
//      if the parameter is for a reg that was preserved, this references
//       the preserved parameter on the return stack
//
//     other
//      assumes parameter type is actually an id for an x86 reg target value
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compiledisplacement
//
// C prototype: 
//  void dg_compiledisplacement (
//    Bufferhandle* pBHarrayhead,
//    dg_Sibformatter* psf)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where  
//                                 the other bufferhandles are stored.
//
//  dg_Sibformatter* psf          pointer to a dg_Sibformatter structure which 
//                                 holds a description of an assembler
//                                 instruction target.
//
// Outputs:
//  none
//                              
// Action:
//   Compiles the psf's displacement to the current compile buffer. The number
//    of bytes compiled is based on the displacementsize element of the psf
//    structure, not the actual displacement value. This is so that the user can
//    control the encoding of the displacement if they wish.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_formatpsf
//
// C prototype: 
//  void dg_formatpsf(
//   Bufferhandle* pBHarrayhead,
//   dg_Sibformatter* psf)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where  
//                                 the other bufferhandles are stored.
//
//  dg_Sibformatter* psf          pointer to a dg_Sibformatter structure which 
//                                 holds a description of an assembler
//                                 instruction target.
//
// Outputs:
//  none
//                              
// Action:
//   Formats the sib formatter for the instruction's addressing mode.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilepsf
//
// C prototype: 
//  void dg_compilepsf(
//    Bufferhandle* pBHarrayhead,
//    dg_Sibformatter* psf)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where  
//                                 the other bufferhandles are stored.
//
//  dg_Sibformatter* psf          pointer to a dg_Sibformatter structure which 
//                                 holds a description of an assembler
//                                 instruction target.
//
// Outputs:
//  none
//                              
// Action:
//   Using the values in the psf structure,
//   compiles the psf's modr/m byte, sib byte if needed, and displacement
//     bytes if needed.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_queryisrega
//
// C prototype: 
//  UINT64 dg_queryisrega(UINT64 regtype)
//
// Inputs:
//  UINT64    regtype            Diaperglu code id for an x86 register
//
// Outputs:
//  UINT64    return             FORTH_TRUE if regtype is for al, ax, eax, or rax
//                               FORTH_FALSE otherwise
//                              
// Action:
//   Checks to see if regtype is one of dg_al, dg_ax, dg_eax, or dg_rax and returns
//    FORTH_TRUE if it is. Returns FORTH_FALSE if it is not.
// 
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_gettargettype
//
// C prototype: 
//  UINT64 dg_gettargettype (
//    Bufferhandle* pBHarrayhead,
//    dg_Sibformatter* psf)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where 
//                                 the other  bufferhandles are stored.
//
//  dg_Sibformatter* psf          pointer to a dg_Sibformatter structure which 
//                                 holds a description of an assembler
//                                 instruction target.
//
// Outputs:
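//  UINT64 return                 code for the type of target described by psf,
//                                 which is one of: immediate, default register,
//                                 register, or memory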
//                              
// Action:
//   Checks the values saved in the psf structure from when its values were 
//    pulled from the data stack and returns the target specified for one of:
//      an immediate target  e.g. N
//      a default register target that is one of the al, ax, eax, rax  e.g. EAX
//      a register target  e.g. EBX R
//      a memory target  e.g. EDX 5 [R+N] 
//
// Note:
//   There are two ways to specify a register target
//     when Diaperglu is pulling a target from the data stack to fill the psf.
//     If no memory mode is specified,
//     Diaperglu assumes the memory mode is default register and that the value is
//     the code id for a register. E.g. EAX
//     The other way is to use the register mode specifier. E.g. EAX R 
//
// Note:
//   I'm planning to add a MODR one... to force minimum modr encoding
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_determine2targettype
//
// C prototype: 
//  UINT64 dg_determine2targettype (
//    Bufferhandle* pBHarrayhead,
//    dg_Sibformatter* psf1, // top on stack
//    dg_Sibformatter* psf2) // second on stack
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where 
//                                 the other  bufferhandles are stored.
//
//  dg_Sibformatter* psf1         pointer to a dg_Sibformatter structure which 
//                                 holds a description of an assembler
//                                 instruction target.
//                                 This holds the first target pulled from the
//                                 stack.
//
//  dg_Sibformatter* psf2         pointer to a dg_Sibformatter structure which 
//                                 holds a description of an assembler
//                                 instruction target.
//                                 This holds the second target pulled from the
//                                 stack.
//
// Outputs:
//  UINT64           return       code for a two target type for second pulled
//                                 to first pulled, which is one of:
//                                  immediate to immediate
//                                  immediate to default register
//                                  immediate to register
//                                  immediate to memory
//                                  default register to immediate
//                                  default register to default register
//                                  default register to register
//                                  default register to memory
//                                  register to immediate
//                                  register to default register
//                                  register to register
//                                  register to memory
//                                  memory to immediate
//                                  memory to default register
//                                  memory to register
//                                  memory to memory
//                              
// Action:
//   Checks the values saved in the two psf structures from when the targets
//     were pulled from the data stack and returns the two target mode.
//
// Note:
//   There are two ways to specify a register target:
//     If no memory mode is specified,
//      Diaperglu assumes the memory mode is default register and that the value
//      is the code id for a register. e.g. EAX
//     The other way is to use the register mode specifier. e.g. EAX R
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilerexnosizetargetreg
//
// C prototype: 
//  void dg_compilerexnosizetargetreg (
//   Bufferhandle* pBHarrayhead,
//   dg_Sibformatter* psf,
//   UINT64 addresssize)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where 
//                                 the other  bufferhandles are stored.
//
//  dg_Sibformatter* psf          pointer to a dg_Sibformatter structure which
//                                 holds a description of an assembler
//                                 instruction target.
//                                 This holds the first target pulled from the
//                                 stack.
//
//  UINT64 addresssize            address size in bytes
//                                 if you are compiling in 64 bit address mode,
//                                  this is 8
//                                 if you are compiling in 32 bit address mode,
//                                  this is 4
//
// Outputs:
//  none
//                              
// Action:
//   If you are in 64 bit address mode, compiles the rex prefix for an instruction
//    ignoring the data size and target register.
//   Only the base register and index register for the instruction are used.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilerexnotargetreg
//
// C prototype: 
//  void dg_compilerexnotargetreg (
//   Bufferhandle* pBHarrayhead,
//   dg_Sibformatter* psf,
//   UINT64 addresssize)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where 
//                                 the other  bufferhandles are stored.
//
//  dg_Sibformatter* psf          pointer to a dg_Sibformatter structure which
//                                 holds a description of an assembler
//                                 instruction target.
//                                 This holds the first target pulled from the
//                                 stack.
//
//  UINT64 addresssize            address size in bytes
//                                 if you are compiling in 64 bit address mode,
//                                  this is 8
//                                 if you are compiling in 32 bit address mode,
//                                  this is 4
//
// Outputs:
//  none
//                              
// Action:
//   If you are in 64 bit address mode, compiles the rex prefix for an instruction
//    ignoring the target register.
//   Only the data size, base register, and index register for the instruction
//    are used.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilerex
//
// C prototype: 
//  void dg_compilerex (
//   Bufferhandle* pBHarrayhead,
//   dg_Sibformatter* psf,
//   UINT64 addresssize)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where 
//                                 the other  bufferhandles are stored.
//
//  dg_Sibformatter* psf          pointer to a dg_Sibformatter structure which
//                                 holds a description of an assembler
//                                 instruction target.
//                                 This holds the first target pulled from the
//                                 stack.
//
//  UINT64 addresssize            address size in bytes
//                                 if you are compiling in 64 bit address mode,
//                                  this is 8
//                                 if you are compiling in 32 bit address mode,
//                                  this is 4
//
// Outputs:
//  none
//                              
// Action:
//   If you are in 64 bit address mode, compiles the rex prefix for an instruction.
//   The data size, target register, base register, and index register for the
//    instruction are used.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilen8tom8
//
// C prototype: 
//  void dg_compilen8tom8 (
//    Bufferhandle* pBHarrayhead,
//    Twotargetopcodestrings* popcodes,
//    dg_Sibformatter* pimmediatepsf, // assumed to be n8
//    dg_Sibformatter* pregpsf) // assumed to be m8 with m8 already set up
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
//  Twotargetopcodestrings* popcodes      pointer to a Twotargetopcodestrings
//                                         structure which holds the opcode 
//                                         strings and opcode extensions for the 
//                                         different addressing modes of an x86
//                                         instruction.
//
//  dg_Sibformatter*        pimmediatepsf pointer to a dg_Sibformatter structure 
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the immediate target pulled
//                                         from the stack.
//
//  dg_Sibformatter*        pregpsf       pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the memory target pulled
//                                         from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   Compiles the 8 bit immediate target to 8 bit memory target form of an x86
//     instruction using the immediate value in pimmediatepsf and memory mode
//     specified in pregpsf.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilen16tom16
//
// C prototype: 
//  void dg_compilen16tom16 (
//    Bufferhandle* pBHarrayhead,
//    Twotargetopcodestrings* popcodes,
//    dg_Sibformatter* pimmediatepsf, // assumed to be n16
//    dg_Sibformatter* pregpsf) // assumed to be m16 with m16 already set up
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
//  Twotargetopcodestrings* popcodes      pointer to a Twotargetopcodestrings
//                                         structure which holds the opcode 
//                                         strings and opcode extensions for the 
//                                         different addressing modes of an x86
//                                         instruction.
//
//  dg_Sibformatter*        pimmediatepsf pointer to a dg_Sibformatter structure 
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the immediate target pulled
//                                         from the stack.
//
//  dg_Sibformatter*        pregpsf       pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the memory target pulled
//                                         from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   Compiles the 16 bit immediate target to 16 bit memory target form of an
//     x86 instruction using the immediate value in pimmediatepsf and memory
//     mode specified in pregpsf.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_compilen32tom32
//
// C prototype: 
//  void dg_compilen32tom32 (
//    Bufferhandle* pBHarrayhead,
//    Twotargetopcodestrings* popcodes,
//    dg_Sibformatter* pimmediatepsf, // assumed to be n32
//    dg_Sibformatter* pmempsf) // assumed to be m32 with m32 already set up
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
//  Twotargetopcodestrings* popcodes      pointer to a Twotargetopcodestrings
//                                         structure which holds the opcode 
//                                         strings and opcode extensions for the 
//                                         different addressing modes of an x86
//                                         instruction.
//
//  dg_Sibformatter*        pimmediatepsf pointer to a dg_Sibformatter structure 
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the immediate target pulled
//                                         from the stack.
//
//  dg_Sibformatter*        pmempsf       pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the memory target pulled
//                                         from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   Compiles the 32 bit immediate target to 32 bit memory target form of an x86
//     instruction using the immediate value in pimmediatepsf and memory mode
//     specified in pmempsf.
//
// //////////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilen8tom32
//
// C prototype: 
//  void dg_compilen8tom32 (
//    Bufferhandle* pBHarrayhead,
//    Twotargetopcodestrings* popcodes,
//    dg_Sibformatter* pimmediatepsf, // assumed to be n8 signed
//    dg_Sibformatter* pmempsf) // assumed to be m32 with m32 already set up
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
//  Twotargetopcodestrings* popcodes      pointer to a Twotargetopcodestrings
//                                         structure which holds the opcode 
//                                         strings and opcode extensions for the 
//                                         different addressing modes of an x86
//                                         instruction.
//
//  dg_Sibformatter*        pimmediatepsf pointer to a dg_Sibformatter structure 
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the immediate target pulled
//                                         from the stack.
//
//  dg_Sibformatter*        pmempsf       pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the memory target pulled
//                                         from the stack.
// Outputs:
//  none
//                              
// Action:
//   Compiles the 8 bit signed immediate target to 32 bit memory target form of
//     an x86 instruction using the immediate value in pimmediatepsf and memory
//     mode specified in pmempsf.
//   If the immediate value does not fit into an 8 bit signed value, the 32 bit
//     immediate to 32 bit memory form of the instruction is used instead.
//   If the minimum allowed size of the immediate value is greater than 8 bits, 
//     the 32 bit immediate to 32 bit memory form of the instruction is used
//     instead.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilen8tom16
//
// C prototype: 
//  void dg_compilen8tom16 (
//    Bufferhandle* pBHarrayhead,
//    Twotargetopcodestrings* popcodes,
//    dg_Sibformatter* pimmediatepsf, // assumed to be n8
//    dg_Sibformatter* pmempsf) // assumed to be m16 with m16 already set up
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
//  Twotargetopcodestrings* popcodes      pointer to a Twotargetopcodestrings 
//                                         structure which holds the opcode
//                                         strings and opcode extensions for
//                                         the different addressing modes of an
//                                         x86 instruction.
//
//  dg_Sibformatter*        pimmediatepsf pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the immediate target
//                                         pulled from the stack.
//
//  dg_Sibformatter*        pmempsf       pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the memory target pulled
//                                         from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   Compiles the 8 bit signed immediate target to 16 bit memory target form of 
//     an x86 instruction using the immediate value in pimmediatepsf and memory
//     mode specified in pmempsf.
//   If the immediate value does not fit into an 8 bit signed value, the 16 bit
//     immediate to 16 bit memory form of the instruction is used instead.
//   If the minimum allowed size of the immediate value is greater than 8 bits,
//     the 16 bit immediate to 16 bit memory form of the instruction is used
//     instead.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilentom
//
// C prototype: 
//  void dg_compilentom (
//    Bufferhandle* pBHarrayhead,
//    Twotargetopcodestrings* popcodes,
//    dg_Sibformatter* pimmediatepsf,
//    dg_Sibformatter* pmempsf)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the otherbufferhandles
//                                         are stored.
//
//  Twotargetopcodestrings* popcodes      pointer to a Twotargetopcodestrings 
//                                         structure which holds the opcode
//                                         strings and opcode extensions for
//                                         the different addressing modes of an
//                                         x86 instruction.
//
//  dg_Sibformatter*        pimmediatepsf pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the immediate target
//                                         pulled from the stack.
//
//  dg_Sibformatter*        pmempsf       pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the memory target pulled
//                                         from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   Figures out the data size of the instruction, then
//     compiles the immediate to memory target form of an x86 instruction
//     for that size using the immediate value in pimmediatepsf and memory mode
//     specified in pmempsf.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compileopcodeplusropstr
//
// C prototype: 
//  void dg_compileopcodeplusropstr (
//    Bufferhandle* pBHarrayhead,
//    const char* popcodestring,
//    UINT64 opcodestringlength,
//    UINT64 reg)
//
// Inputs:
//  Bufferhandle* pBHarrayhead        pointer to a Bufferhandle structure 
//                                     which is used as the bufferhandle for
//                                     the array where the otherbufferhandles
//                                     are stored.
//
//  const char*   popcodestring       pointer to an opcode string
//
//  UINT64        opcodestringlength  length of the opcodestring
//
//  UINT64        reg                 Diaperglu code id for an x86
//                                     register
//
// Outputs:
//  none
//                              
// Action:
//   If in 64 bit mode
//    and the instruction is not PUSH or POP,
//    and the register target is 64 bits,
//    then a REX prefix is used and the REX size is 64 bits.
//   If in 64 bit mode
//    and the register target is a REX register
//    then a REX prefix is used that marks the target register as a REX register.
//
//   Then adds the lower 3 bits of the register code (0-7) to the last byte of the
//    opcode string then compiles the opcode string.
//   This compiles the opcode plus r form of an x86 instruction.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilemem8
//
// C prototype: 
//  void dg_compilemem8 (
//    Bufferhandle* pBHarrayhead,
//    struct Onetargetopcodestrings* popcodes,
//    dg_Sibformatter* ptargetsf)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the otherbufferhandles
//                                         are stored.
//
//  Onetargetopcodestrings* popcodes      pointer to a Onetargetopcodestrings 
//                                         structure which holds the opcode
//                                         strings and opcode extensions for
//                                         the different addressing modes of an
//                                         x86 instruction.
//
//  dg_Sibformatter*        ptargetsf     pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the memory target
//                                         pulled from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   If popcodes structure has the 8 bit memory target opcode string,
//    this routine uses the 8 bit memory target opcode string and opcode extension
//    in the popcodes structure along with the memory target specified in
//    ptargetsf to compile an x86 instruction.
//   Otherwise, this routine pushes an error to the error stack.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilereg8
//
// C prototype: 
//  void dg_compilereg8 (
//    Bufferhandle* pBHarrayhead,
//    struct Onetargetopcodestrings* popcodes,
//    dg_Sibformatter* ptargetsf)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for 
//                                         the array where the otherbufferhandles
//                                         are stored.
//
//  Onetargetopcodestrings* popcodes      pointer to a Onetargetopcodestrings 
//                                         structure which holds the opcode
//                                         strings and opcode extensions for
//                                         the different addressing modes of an
//                                         x86 instruction.
//
//  dg_Sibformatter*        ptargetsf     pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the register target
//                                         pulled from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   If popcodes structure has the 8 bit register target opcode string,
//    this routine uses the 8 bit register target opcode string in the
//    popcodes structure along with the register target specified in ptargetsf
//    to compile an x86 instruction.
//   Otherwise this routine passes control to dg_compilemem to attempt to 
//    compile the modrslashm for a register target form of the instruction.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilerega8
//
// C prototype: 
//  void dg_compilerega8 (
//    Bufferhandle* pBHarrayhead,
//    struct Onetargetopcodestrings* popcodes,
//    dg_Sibformatter* ptargetsf)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the otherbufferhandles
//                                         are stored.
//
//  Onetargetopcodestrings* popcodes      pointer to a Onetargetopcodestrings 
//                                         structure which holds the opcode
//                                         strings and opcode extensions for
//                                         the different addressing modes of an
//                                         x86 instruction.
//
//  dg_Sibformatter*        ptargetsf     pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the register target
//                                         pulled from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   If popcodes structure has the 8 bit register a target opcode string and the
//    target register is al, this routine uses the string to compile the 8 bit
//    register a form of an x86 instruction.
//   Otherwise, this routine passes control to dg_compilereg to attempt to
//    compile the opcode plus r form of the instruction.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilen8
//
// C prototype: 
//  void dg_compilen8 (
//    Bufferhandle* pBHarrayhead,
//    struct Onetargetopcodestrings* popcodes,
//    dg_Sibformatter* ptargetsf)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the otherbufferhandles
//                                         are stored.
//
//  Onetargetopcodestrings* popcodes      pointer to a Onetargetopcodestrings 
//                                         structure which holds the opcode
//                                         strings and opcode extensions for
//                                         the different addressing modes of an
//                                         x86 instruction.
//
//  dg_Sibformatter*        ptargetsf     pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the immediate target
//                                         pulled from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   If popcodes structure has the 8 bit immediate target opcode string,
//     this routine uses the string along with the immediate value in
//     ptargetsf to compile the 8 bit immediate form of an x86 instruction.
//   Otherwise, this routine pushes an error to the error stack.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilen16
//
// C prototype: 
//  void dg_compilen16 (
//    Bufferhandle* pBHarrayhead,
//    struct Onetargetopcodestrings* popcodes,
//    dg_Sibformatter* ptargetsf)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the otherbufferhandles
//                                         are stored.
//
//  Onetargetopcodestrings* popcodes      pointer to a Onetargetopcodestrings 
//                                         structure which holds the opcode
//                                         strings and opcode extensions for
//                                         the different addressing modes of an
//                                         x86 instruction.
//
//  dg_Sibformatter*        ptargetsf     pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the immediate target
//                                         pulled from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   If popcodes structure has the 16 bit immediate target opcode string,
//     this routine uses the string along with the immediate value in ptargetsf
//     to compile the 16 bit immediate form of an x86 instruction.
//   Otherwise, this routine pushes an error to the error stack.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilen16signextended
//
// C prototype: 
//  void dg_compilen16signextended (
//    Bufferhandle* pBHarrayhead,
//    struct Onetargetopcodestrings* popcodes,
//    dg_Sibformatter* ptargetsf)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the otherbufferhandles
//                                         are stored.
//
//  Onetargetopcodestrings* popcodes      pointer to a Onetargetopcodestrings 
//                                         structure which holds the opcode
//                                         strings and opcode extensions for
//                                         the different addressing modes of an
//                                         x86 instruction.
//
//  dg_Sibformatter*        ptargetsf     pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the immediate target
//                                         pulled from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   If popcodes structure has the 8 bit immediate sign extended to 16 bit
//    target opcode string, this routine uses the string along with the 
//    immediate value in ptargetsf to compile the 8 bit immediate sign extended
//    to 16 bit target form of an x86 instruction.
//   Otherwise, this routine passes control to dg_compilen16 to attempt to 
//    compile the 16 bit immediate form of the instruction.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilen32
//
// C prototype: 
//  void dg_compilen32 (
//    Bufferhandle* pBHarrayhead,
//    struct Onetargetopcodestrings* popcodes,
//    dg_Sibformatter* ptargetsf)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the otherbufferhandles
//                                         are stored.
//
//  Onetargetopcodestrings* popcodes      pointer to a Onetargetopcodestrings 
//                                         structure which holds the opcode
//                                         strings and opcode extensions for
//                                         the different addressing modes of an
//                                         x86 instruction.
//
//  dg_Sibformatter*        ptargetsf     pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the immediate target
//                                         pulled from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   If popcodes structure has the 32 bit immediate target opcode string,
//     this routine uses the string along with the immediate value in ptargetsf
//     to compile the 32 bit immediate form of an x86 instruction.
//   Otherwise, this routine pushes an error to the error stack.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilen32signextended
//
// C prototype: 
//  void dg_compilen32signextended (
//    Bufferhandle* pBHarrayhead,
//    struct Onetargetopcodestrings* popcodes,
//    dg_Sibformatter* ptargetsf)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the otherbufferhandles
//                                         are stored.
//
//  Onetargetopcodestrings* popcodes      pointer to a Onetargetopcodestrings 
//                                         structure which holds the opcode
//                                         strings and opcode extensions for
//                                         the different addressing modes of an
//                                         x86 instruction.
//
//  dg_Sibformatter*        ptargetsf     pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the immediate target
//                                         pulled from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   If popcodes structure has the 8 bit immediate sign extended to 32 bit
//    target opcode string, this routine uses the string along with the
//    immediate value in ptargetsf to compile the 8 bit immediate sign extended
//    to 32 bit target form of an x86 instruction.
//   Otherwise, this routine passes control to dg_compilen32 to attempt to
//    compile the 32 bit immediate form of the instruction.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilemem16
//
// C prototype: 
//  void dg_compilemem16 (
//    Bufferhandle* pBHarrayhead,
//    struct Onetargetopcodestrings* popcodes,
//    dg_Sibformatter* ptargetsf)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the otherbufferhandles
//                                         are stored.
//
//  Onetargetopcodestrings* popcodes      pointer to a Onetargetopcodestrings 
//                                         structure which holds the opcode
//                                         strings and opcode extensions for
//                                         the different addressing modes of an
//                                         x86 instruction.
//
//  dg_Sibformatter*        ptargetsf     pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the memory target
//                                         pulled from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   If popcodes structure has the 16 bit memory target opcode string,
//    this routine uses the 16 bit memory target opcode string and opcode
//    extension in the popcodes structure along with the memory target specified
//    in ptargetsf to compile an x86 instruction.
//   Otherwise, this routine pushes an error to the error stack.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilereg16
//
// C prototype: 
//  void dg_compilereg16 (
//    Bufferhandle* pBHarrayhead,
//    struct Onetargetopcodestrings* popcodes,
//    dg_Sibformatter* ptargetsf)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the otherbufferhandles
//                                         are stored.
//
//  Onetargetopcodestrings* popcodes      pointer to a Onetargetopcodestrings 
//                                         structure which holds the opcode
//                                         strings and opcode extensions for
//                                         the different addressing modes of an
//                                         x86 instruction.
//
//  dg_Sibformatter*        ptargetsf     pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the register target
//                                         pulled from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   If popcodes structure has the 16 bit register target opcode string,
//    this routine uses the 16 bit register target opcode string in the
//    popcodes structure along with the register target specified in ptargetsf
//    to compile an x86 instruction.
//   Otherwise this routine passes control to dg_compilemem to attempt to compile
//     the modrslashm for a register target form of the instruction.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilerega16
//
// C prototype: 
//  void dg_compilerega16 (
//    Bufferhandle* pBHarrayhead,
//    struct Onetargetopcodestrings* popcodes,
//    dg_Sibformatter* ptargetsf)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the otherbufferhandles
//                                         are stored.
//
//  Onetargetopcodestrings* popcodes      pointer to a Onetargetopcodestrings 
//                                         structure which holds the opcode
//                                         strings and opcode extensions for
//                                         the different addressing modes of an
//                                         x86 instruction.
//
//  dg_Sibformatter*        ptargetsf     pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the register target
//                                         pulled from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   If popcodes structure has the 16 bit register a target opcode string and 
//    the target register is ax, this routine uses the string to compile the
//    16 bit register a form of an x86 instruction.
//   Otherwise, this routine passes control to dg_compilereg to attempt to
//    compile the opcode plus r form of the instruction.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilemem32
//
// C prototype: 
//  void dg_compilemem32 (
//    Bufferhandle* pBHarrayhead,
//    struct Onetargetopcodestrings* popcodes,
//    dg_Sibformatter* ptargetsf)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the otherbufferhandles
//                                         are stored.
//
//  Onetargetopcodestrings* popcodes      pointer to a Onetargetopcodestrings 
//                                         structure which holds the opcode
//                                         strings and opcode extensions for
//                                         the different addressing modes of an
//                                         x86 instruction.
//
//  dg_Sibformatter*        ptargetsf     pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the memory target
//                                         pulled from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   If popcodes structure has the 32 bit memory target opcode string,
//    this routine uses the 32 bit memory target opcode string and opcode
//    extension in the popcodes structure along with the memory target specified
//    in ptargetsf to compile an x86 instruction.
//   Otherwise, this routine pushes an error to the error stack.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilereg32
//
// C prototype: 
//  void dg_compilereg32 (
//    Bufferhandle* pBHarrayhead,
//    struct Onetargetopcodestrings* popcodes,
//    dg_Sibformatter* ptargetsf)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the otherbufferhandles
//                                         are stored.
//
//  Onetargetopcodestrings* popcodes      pointer to a Onetargetopcodestrings 
//                                         structure which holds the opcode
//                                         strings and opcode extensions for
//                                         the different addressing modes of an
//                                         x86 instruction.
//
//  dg_Sibformatter*        ptargetsf     pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the register target
//                                         pulled from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   If popcodes structure has the 32 bit register target opcode string,
//    this routine uses the 32 bit register target opcode string in the popcodes
//    structure along with the register target specified in ptargetsf to
//    compile an x86 instruction.
//   Otherwise this routine passes control to dg_compilemem to attempt to compile
//     the modrslashm for a register target form of the instruction.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilerega32
//
// C prototype: 
//  void dg_compilerega32 (
//    Bufferhandle* pBHarrayhead,
//    struct Onetargetopcodestrings* popcodes,
//    dg_Sibformatter* ptargetsf)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the otherbufferhandles
//                                         are stored.
//
//  Onetargetopcodestrings* popcodes      pointer to a Onetargetopcodestrings 
//                                         structure which holds the opcode
//                                         strings and opcode extensions for
//                                         the different addressing modes of an
//                                         x86 instruction.
//
//  dg_Sibformatter*        ptargetsf     pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the register target
//                                         pulled from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   If popcodes structure has the 32 bit register a target opcode string and
//    the target register is eax, this routine uses the string to compile the
//    32 bit register a form of an x86 instruction.
//   Otherwise, this routine passes control to dg_compilereg to attempt to
//    compile the opcode plus r form of the instruction.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilentarget
//
// C prototype: 
//  void dg_compilentarget (
//    Bufferhandle* pBHarrayhead,
//    struct Onetargetopcodestrings* popcodes,
//    dg_Sibformatter* ptargetsf)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the otherbufferhandles
//                                         are stored.
//
//  Onetargetopcodestrings* popcodes      pointer to a Onetargetopcodestrings 
//                                         structure which holds the opcode
//                                         strings and opcode extensions for
//                                         the different addressing modes of an
//                                         x86 instruction.
//
//  dg_Sibformatter*        ptargetsf     pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the immediate target
//                                         pulled from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   Figures out the data size of the instruction, then
//     compiles the immediate target form of an x86 instruction for that size
//     using the immediate value in ptargetsf.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compileregatarget
//
// C prototype: 
//  void dg_compileregatarget (
//    Bufferhandle* pBHarrayhead,
//    struct Onetargetopcodestrings* popcodes,
//    dg_Sibformatter* ptargetsf)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the otherbufferhandles
//                                         are stored.
//
//  Onetargetopcodestrings* popcodes      pointer to a Onetargetopcodestrings 
//                                         structure which holds the opcode
//                                         strings and opcode extensions for
//                                         the different addressing modes of an
//                                         x86 instruction.
//
//  dg_Sibformatter*        ptargetsf     pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the register target
//                                         pulled from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   Figures out the data size of the instruction, then
//    compiles the register a target form of an x86 instruction.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compileregtarget
//
// C prototype: 
//  void dg_compileregtarget (
//    Bufferhandle* pBHarrayhead,
//    struct Onetargetopcodestrings* popcodes,
//    dg_Sibformatter* ptargetsf)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
//  Onetargetopcodestrings* popcodes      pointer to a Onetargetopcodestrings 
//                                         structure which holds the opcode
//                                         strings and opcode extensions for
//                                         the different addressing modes of an
//                                         x86 instruction.
//
//  dg_Sibformatter*        ptargetsf     pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the register target
//                                         pulled from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   Figures out the data size of the instruction, then
//     compiles the register target form of an x86 instruction.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilememtarget
//
// C prototype: 
//  void dg_compilememtarget (
//    Bufferhandle* pBHarrayhead,
//    struct Onetargetopcodestrings* popcodes,
//    dg_Sibformatter* ptargetsf)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
//  Onetargetopcodestrings* popcodes      pointer to a Onetargetopcodestrings 
//                                         structure which holds the opcode
//                                         strings and opcode extensions for
//                                         the different addressing modes of an
//                                         x86 instruction.
//
//  dg_Sibformatter*        ptargetsf     pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the memory target
//                                         pulled from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   Figures out the data size of the instruction, then
//     compiles the memory target form of an x86 instruction.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compileonetarget
//
// C prototype: 
//  void dg_compileonetarget (
//    Bufferhandle* pBHarrayhead,
//    struct Onetargetopcodestrings* popcodes,
//    dg_Sibformatter* ptargetsf)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
//  Onetargetopcodestrings* popcodes      pointer to a Onetargetopcodestrings 
//                                         structure which holds the opcode
//                                         strings and opcode extensions for
//                                         the different addressing modes of an
//                                         x86 instruction.
//
//  dg_Sibformatter*        ptargetsf     pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the target
//                                         pulled from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   Determines the target type of ptargetsf, then passes control to the
//     correct compiling routine.
//
// Notes:
//   Target types are:
//     immediate
//     register a
//     register
//     memory
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_fillonetargetmemonlyoptable
//
// C prototype: 
//  void dg_fillonetargetmemonlyoptable (
//    Bufferhandle* pBHarrayhead,
//    Onetargetopcodestrings* popcodes,
//    UINT64 baseopcode,
//    UINT64 opcodeextension)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
//  Onetargetopcodestrings* popcodes      pointer to a Onetargetopcodestrings 
//                                         structure which holds the opcode
//                                         strings and opcode extensions for
//                                         the different addressing modes of an
//                                         x86 instruction.
//
//  UINT64                  baseopcode      value of one byte opcode for a one
//                                           target memory only instruction
//  UINT64                  opcodeextension opcode extension for one target
//                                           memory only instruction
//
// Outputs:
//  none
//                              
// Action:
//   Initializes a one target opcode table for an instruction which only has the
//    m8, m16, and m32 addressing modes with one byte opcodes.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilen8tor8
//
// C prototype:
//
//  void dg_compilen8tor8 (
//    Bufferhandle* pBHarrayhead,
//    Twotargetopcodestrings* popcodes,
//    dg_Sibformatter* pimmediatepsf, // assumed to be n8
//    dg_Sibformatter* pregpsf) // assumed to be r8
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
//  Twotargetopcodestrings* popcodes      pointer to a Twotargetopcodestrings
//                                         structure which holds the opcode 
//                                         strings and opcode extensions for the 
//                                         different addressing modes of an x86
//                                         instruction.
//
//  dg_Sibformatter*        pimmediatepsf pointer to a dg_Sibformatter structure 
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the immediate target pulled
//                                         from the stack.
//
//  dg_Sibformatter*        pregpsf       pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the register target pulled
//                                         from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   Compiles the 8 bit immediate target to 8 bit register target form of
//    an x86 instruction using the immediate value in pimmediatepsf and register
//    specified in pregpsf.
//   If the opcode string for this mode is missing, control is passed to
//    dg_compilen8tom8 to compile the modrslashm form of the instruction.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilen16tor16
//
// C prototype:
//
//  void dg_compilen16tor16 (
//    Bufferhandle* pBHarrayhead,
//    Twotargetopcodestrings* popcodes,
//    dg_Sibformatter* pimmediatepsf, // assumed to be n16
//    dg_Sibformatter* pregpsf) // assumed to be r16
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
//  Twotargetopcodestrings* popcodes      pointer to a Twotargetopcodestrings
//                                         structure which holds the opcode 
//                                         strings and opcode extensions for the 
//                                         different addressing modes of an x86
//                                         instruction.
//
//  dg_Sibformatter*        pimmediatepsf pointer to a dg_Sibformatter structure 
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the immediate target pulled
//                                         from the stack.
//
//  dg_Sibformatter*        pregpsf       pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the register target pulled
//                                         from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   Compiles the 16 bit immediate target to 16 bit register target form of an x86
//    instruction using the immediate value in pimmediatepsf and register
//    specified in pregpsf.
//   If the opcode string for this mode is missing, control is passed to
//    dg_compilen16tom16 to compile the modrslashm form of the instruction.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilen32tor32
//
// C prototype:
//  void dg_compilen32tor32 (
//    Bufferhandle* pBHarrayhead,
//    Twotargetopcodestrings* popcodes,
//    dg_Sibformatter* pimmediatepsf, // assumed to be n32
//    dg_Sibformatter* pregpsf) // assumed to be r32
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
//  Twotargetopcodestrings* popcodes      pointer to a Twotargetopcodestrings
//                                         structure which holds the opcode 
//                                         strings and opcode extensions for the 
//                                         different addressing modes of an x86
//                                         instruction.
//
//  dg_Sibformatter*        pimmediatepsf pointer to a dg_Sibformatter structure 
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the immediate target pulled
//                                         from the stack.
//
//  dg_Sibformatter*        pregpsf       pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the register target pulled
//                                         from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   Compiles the 32 bit immediate target to 32 bit register target form of an
//    x86 instruction using the immediate value in pimmediatepsf and register
//    specified in pregpsf.
//   In 64 bit mode, this also compiles the 32 bit immediate target to 64 bit
//    register target form.
//   If the opcode string for this mode is missing, control is passed to
//    dg_compilen32tom32 to compile the modrslashm form of the instruction.
//   If in 64 bit mode, the register size is 64 bits, and the opcode
//    is for a MOV instruction, then the immediate size is promoted to 64 bits.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilentor
//
// C prototype:
//  void dg_compilentor (
//    Bufferhandle* pBHarrayhead,
//    Twotargetopcodestrings* popcodes,
//    dg_Sibformatter* pimmediatepsf,
//    dg_Sibformatter* pregpsf)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
//  Twotargetopcodestrings* popcodes      pointer to a Twotargetopcodestrings
//                                         structure which holds the opcode 
//                                         strings and opcode extensions for the 
//                                         different addressing modes of an x86
//                                         instruction.
//
//  dg_Sibformatter*        pimmediatepsf pointer to a dg_Sibformatter structure 
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the immediate target pulled
//                                         from the stack.
//
//  dg_Sibformatter*        pregpsf       pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the register target pulled
//                                         from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   Figures out the data size of the instruction, then compiles one of the 
//    immediate to register target forms of an x86 instruction for that size
//    using the immediate value in pimmediatepsf and register specified in
//    pregpsf.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilen8toa8
//
// C prototype:
//  void dg_compilen8toa8 (
//    Bufferhandle* pBHarrayhead,
//    Twotargetopcodestrings* popcodes,
//    dg_Sibformatter* pimmediatepsf,
//    dg_Sibformatter* pmempsf)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
//  Twotargetopcodestrings* popcodes      pointer to a Twotargetopcodestrings
//                                         structure which holds the opcode 
//                                         strings and opcode extensions for the 
//                                         different addressing modes of an x86
//                                         instruction.
//
//  dg_Sibformatter*        pimmediatepsf pointer to a dg_Sibformatter structure 
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the immediate target pulled
//                                         from the stack.
//
//  dg_Sibformatter*        pmempsf       pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the register target pulled
//                                         from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   Compiles the 8 bit immediate target to 8 bit register a target form of an
//    x86 instruction using the immediate value in pimmediatepsf.
//   If the opcode string for this mode is missing, control is passed to
//    dg_compilen8tor8 to compile the opcode plus r form of the instruction.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilen16toa16
//
// C prototype:
//  void dg_compilen16toa16 (
//    Bufferhandle* pBHarrayhead,
//    Twotargetopcodestrings* popcodes,
//    dg_Sibformatter* pimmediatepsf,
//    dg_Sibformatter* pmempsf)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
//  Twotargetopcodestrings* popcodes      pointer to a Twotargetopcodestrings
//                                         structure which holds the opcode 
//                                         strings and opcode extensions for the 
//                                         different addressing modes of an x86
//                                         instruction.
//
//  dg_Sibformatter*        pimmediatepsf pointer to a dg_Sibformatter structure 
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the immediate target pulled
//                                         from the stack.
//
//  dg_Sibformatter*        pmempsf       pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the register target pulled
//                                         from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   Compiles the 16 bit immediate target to 16 bit register a target form of an
//    x86 instruction using the immediate value in pimmediatepsf.
//   If the opcode string for this mode is missing, control is passed to
//    dg_compilen16tor16 to compile the opcode plus r form of the instruction.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilen32toa32
//
// C prototype:
//  void dg_compilen32toa32 (
//    Bufferhandle* pBHarrayhead,
//    Twotargetopcodestrings* popcodes,
//    dg_Sibformatter* pimmediatepsf,
//    dg_Sibformatter* pmempsf)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
//  Twotargetopcodestrings* popcodes      pointer to a Twotargetopcodestrings
//                                         structure which holds the opcode 
//                                         strings and opcode extensions for the 
//                                         different addressing modes of an x86
//                                         instruction.
//
//  dg_Sibformatter*        pimmediatepsf pointer to a dg_Sibformatter structure 
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the immediate target pulled
//                                         from the stack.
//
//  dg_Sibformatter*        pmempsf       pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the register target pulled
//                                         from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   Compiles the 32 bit immediate target to 32 bit register a target form of
//    an x86 instruction using the immediate value in pimmediatepsf.
//   If the opcode string for this mode is missing, control is passed
//    to dg_compilen32tor32 to compile the opcode plus r form of the instruction.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilentoa
//
// C prototype:
//  void dg_compilentoa (
//    Bufferhandle* pBHarrayhead,
//    Twotargetopcodestrings* popcodes,
//    dg_Sibformatter* pimmediatepsf,
//    dg_Sibformatter* pregpsf)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
//  Twotargetopcodestrings* popcodes      pointer to a Twotargetopcodestrings
//                                         structure which holds the opcode 
//                                         strings and opcode extensions for the 
//                                         different addressing modes of an x86
//                                         instruction.
//
//  dg_Sibformatter*        pimmediatepsf pointer to a dg_Sibformatter structure 
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the immediate target pulled
//                                         from the stack.
//
//  dg_Sibformatter*        pregpsf       pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the register target pulled
//                                         from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   Figures out the data size of the instruction, then compiles one of the 
//     immediate to register a target forms of an x86 instruction for that size
//     using the immediate value in pimmediatepsf.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compilertom
//
// C prototype:
//  void dg_compilertom (
//    Bufferhandle* pBHarrayhead,
//    Twotargetopcodestrings* popcodes,
//    dg_Sibformatter* pregpsf,
//    dg_Sibformatter* pmempsf)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
//  Twotargetopcodestrings* popcodes      pointer to a Twotargetopcodestrings
//                                         structure which holds the opcode 
//                                         strings and opcode extensions for the 
//                                         different addressing modes of an x86
//                                         instruction.
//
//  dg_Sibformatter*        pregpsf       pointer to a dg_Sibformatter structure 
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the register target pulled
//                                         from the stack.
//
//  dg_Sibformatter*        pmempsf       pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the memory target pulled
//                                         from the stack.
//
// Outputs:
//  none
//                              
// Action:
//   Figures out the data size of the instruction, then compiles one of the 
//     register to memory target forms of an x86 instruction for that size
//     using the register target in pregpsf, and the memory target in pmempsf.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_compiletwotargets
//
// C prototype:
//  void dg_compiletwotargets (
//    Bufferhandle* pBHarrayhead,
//    Twotargetopcodestrings* popcodes,
//    dg_Sibformatter* pfirsttarget,  // top on stack
//    dg_Sibformatter* psecondtarget) // second on stack
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
//  Twotargetopcodestrings* popcodes      pointer to a Twotargetopcodestrings
//                                         structure which holds the opcode 
//                                         strings and opcode extensions for the 
//                                         different addressing modes of an x86
//                                         instruction.
//
//  dg_Sibformatter*        pfirsttarget  pointer to a dg_Sibformatter structure 
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the first target pulled
//                                         from the stack. For modes with a
//                                         source and destination you can
//                                         choose, this is the default
//                                         destination.
//
//  dg_Sibformatter*        psecondtarget pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the second target pulled
//                                         from the stack. For modes with a
//                                         source and destination you can
//                                         choose, this is the default
//                                         source.
//
// Outputs:
//  none
//                              
// Action:
//   Determines the two target mode of the instruction then compiles the correct opcode
//     sequence if the mode is supported.
//
// Note:
//   Two target mode for the second pulled target to first pulled target is one of:
//     immediate to immediate - not supported for any instruction
//     immediate to default register
//     immediate to register
//     immediate to memory
//     default register to immediate
//     default register to default register
//     default register to register
//     default register to memory
//     register to immediate
//     register to default register
//     register to register
//     register to memory
//     memory to immediate
//     memory to default register
//     memory to register
//     memory to memory - currently not supported for any instruction
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_fill2targetmathoptbl
//
// C prototype: 
//  void dg_fill2targetmathoptbl (
//    Bufferhandle* pBHarrayhead,
//    struct Twotargetopcodestrings* popcodes,
//    UINT64 mathopindex) // 0-7
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
//  Twotargetopcodestrings* popcodes      pointer to a Twotargetopcodestrings
//                                         structure which holds the opcode 
//                                         strings and opcode extensions for the 
//                                         different addressing modes of an x86
//                                         instruction.
//
//  UINT64                  mathopindex   (0-7). Code id for one of eight
//                                         possible two target math instructions.
//
// Outputs:
//  none
//                              
// Action:
//   Initializes a two target opcode table for one of the 8 x86 math
//    instructions which are: ADC ADD AND CMP OR SUB SBB XOR
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_fill2targetadcxoptbl
//
// C prototype: 
//  void dg_fill2targetadcxoptbl (
//    Bufferhandle* pBHarrayhead,
//    struct Twotargetopcodestrings* popcodes)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
//  Twotargetopcodestrings* popcodes      pointer to a Twotargetopcodestrings
//                                         structure which holds the opcode 
//                                         strings and opcode extensions for the
//                                         different addressing modes of an x86
//                                         instruction.
//
// Outputs:
//  none
//                              
// Action:
//   Initializes a two target opcode table for the ADCX instruction.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_fill2targetmovoptbl
//
// C prototype: 
//  void dg_fill2targetmovoptbl (
//    Bufferhandle* pBHarrayhead,
//    struct Twotargetopcodestrings* popcodes)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
//  Twotargetopcodestrings* popcodes      pointer to a Twotargetopcodestrings
//                                         structure which holds the opcode 
//                                         strings and opcode extensions for the
//                                         different addressing modes of an x86
//                                         instruction.
//
// Outputs:
//  none
//                              
// Action:
//   Initializes the two target opcode table for the MOV instruction.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_fill2targettestoptbl
//
// C prototype: 
//  void dg_fill2targettestoptbl (
//    Bufferhandle* pBHarrayhead,
//    struct Twotargetopcodestrings* popcodes)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
//  Twotargetopcodestrings* popcodes      pointer to a Twotargetopcodestrings
//                                         structure which holds the opcode 
//                                         strings and opcode extensions for the
//                                         different addressing modes of an x86
//                                         instruction.
//
// Outputs:
//  none
//                              
// Action:
//   Initializes the two target opcode table for the TEST instruction.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_fill2targetmemonlyoptbl
//
// C prototype: 
//  void dg_fill2targetmemonlyoptbl (
//    Bufferhandle* pBHarrayhead,
//    struct Twotargetopcodestrings* popcodes,
//    UINT64 memopcode)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
//  Twotargetopcodestrings* popcodes      pointer to a Twotargetopcodestrings
//                                         structure which holds the opcode 
//                                         strings and opcode extensions for the
//                                         different addressing modes of an x86
//                                         instruction.
//
//  UINT64                  memopcode       base opcode byte for the instruction
//
//
// Outputs:
//  none
//                              
// Action:
//   Initializes the two target opcode table for instructions which only
//    support register to or from memory addressing modes with one byte opcode
//    strings.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_fill2targetmembonlyoptbl
//
// C prototype: 
//  void dg_fill2targetmembonlyoptbl (
//    Bufferhandle* pBHarrayhead,
//    struct Twotargetopcodestrings* popcodes,
//    UINT64 memopcode)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
//  Twotargetopcodestrings* popcodes      pointer to a Twotargetopcodestrings
//                                         structure which holds the opcode 
//                                         strings and opcode extensions for the
//                                         different addressing modes of an x86
//                                         instruction.
//
//  UINT64                  memopcode       base opcode byte for this instruction
//
// Outputs:
//  none
//                              
// Action:
//   Initializes the two target opcode table for instructions which only
//    support register to or from memory addressing modes with two byte opcode
//    strings starting with a 0x0F byte.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_fill2targetmem32onlyoptbl
//
// C prototype: 
//  void dg_fill2targetmem32onlyoptbl (
//    Bufferhandle* pBHarrayhead,
//    struct Twotargetopcodestrings* popcodes,
//    UINT64 memopcode)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
//  Twotargetopcodestrings* popcodes      pointer to a Twotargetopcodestrings
//                                         structure which holds the opcode 
//                                         strings and opcode extensions for the
//                                         different addressing modes of an x86
//                                         instruction.
//
//  UINT64                  memopcode       base opcode byte for this instruction
//
// Outputs:
//  none
//                              
// Action:
//   Initializes the two target opcode table for instructions which only
//    support 32 bit register to memory addressing modes with one byte opcode
//    strings.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_fill2targetm32bonlyoptbl
//
// C prototype: 
//  void dg_fill2targetm32bonlyoptbl (
//    Bufferhandle* pBHarrayhead,
//    struct Twotargetopcodestrings* popcodes,
//    UINT64 memopcode)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
//  Twotargetopcodestrings* popcodes      pointer to a Twotargetopcodestrings
//                                         structure which holds the opcode 
//                                         strings and opcode extensions for the
//                                         different addressing modes of an x86
//                                         instruction.
//
//  UINT64                  memopcode     base opcode byte for this instruction
//
// Outputs:
//  none
//                              
// Action:
//   Initializes the two target opcode table for instructions which only
//    support 32 bit register to memory addressing modes with two byte opcode
//    strings starting with a 0x0F byte.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_pullandcompiletwotargets
//
// C prototype: 
//  void dg_pullandcompiletwotargets (
//    Bufferhandle* pBHarrayhead,
//    struct Twotargetopcodestrings* popcodes)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
//  Twotargetopcodestrings* popcodes      pointer to a Twotargetopcodestrings
//                                         structure which holds the opcode 
//                                         strings and opcode extensions for the
//                                         different addressing modes of an x86
//                                         instruction.
//
// Outputs:
//  none
//                              
// Action:
//   Pulls two memory targets from the data stack, then determines the two target 
//    mode, then compiles the instruction using the correct opcode string and 
//    opcode extension from the popcodes structure.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthaaacomma ( AAA, )
//
// C prototype:
//  void dg_forthaaacomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode sequence for an x86 AAA instruction.
//  This instruction adjusts the AL register after addition for an unpacked
//   binary coded decimal result.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  AAA,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthaadcomma ( AAD, )
//
// C prototype:
//  void dg_forthaadcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode sequence for an x86 AAD instruction.
//  This instruction adjusts the AX register after division for an unpacked
//   binary coded decimal result.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  AAD,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthaamcomma ( AAM, )
//
// C prototype:
//  void dg_forthaamcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode sequence for an x86 AAM instruction.
//  This instruction adjusts the AX register after multiplication for an
//   unpacked binary coded decimal result.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  AAM,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthaascomma ( AAS, )
//
// C prototype:
//  void dg_forthaascomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode sequence for an x86 AAS instruction.
//  This instruction adjusts the AL register after subtraction for an
//   unpacked binary coded decimal result.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  AAS,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthadccomma ( ADC, )
//
// C prototype:
//  void dg_forthadccomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist          
//  targetyparameterlist          
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   immediatevalue minimumimmediatesize IMMEDIATE
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   8BIT
//   16BIT
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               integer constant that gets sign extended to
//                                 the DATASIZE of the instruction
//                                 64BIT encoding uses multiple instructions
//                                 smaller encodings get sign extended if needed
//   minimumimmediatesize         minimum encoding size in bytes for 
//                                 immediatevalue.
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15 or RIP
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. If N is larger than a signed
//                                 32 bit integer, multiple instructions 
//                                 are compiled and if the destination is memory
//                                 RAX is used.
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the baseregister plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate 
//                                 value in bytes, can be either 0, 1, 2, or 4
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target. 
//                                 R is optional.
//                                 Using R also forces some instructions to be 
//                                 encoded using the MODR encoding.
//                                 For INC, and DEC, in 32 bit mode; and also
//                                  PUSH, and POP, using R forces the use
//                                  of MODR encoding instead of opcode+r.
//                                 For the ADD, ADC, AND, OR, XOR, SUB, SBB, and
//                                  CMP, immediate to register A instructions,
//                                  using R forces the use of MODR encoding
//                                  instead of the register A opcodes.
//                                 The MODR encoding for add 8 bit immediate 
//                                  sign extended to 32 or 64 bits is shorter
//                                  than the reg a opcode encoding. If you
//                                  want this shorter encoding, you have to
//                                  use R.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m is not supported.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1, 2, 4, or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//                                                              
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 ADC instruction. This opcode sequence adds the destination target to
//    the source target plus the carry bit, and puts the result in the
//    destination target, then changes the condition code flags accordingly.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX 12348000 N  EAX  ADC,  // adds 12348000 plus the carry bit to EAX
//  27 N  CL  ADC,             // adds 27 plus the carry bit to CL
//  AX  RBX [R]  ADC,          // adds AX plus the carry bit to the 16 bit
//                             //  memory at the address in RBX
//  38 N  RDX [R]  32BIT ADC,  // size required, adds 38 plus the carry bit
//                             //  to the 32 bit memory at the address in RDX
//  ECX  EAX  ADC,             // adds ECX plus the carry bit to EAX
//  ECX <- EAX  ADC,        // adds EAX plus the carry bit to ECX
//  RAX R8 ADC,                // adds RAX plus the carry bit to R8
//  38 N  RAX R  ADC,          // adds 38 plus the carry bit to RAX using the 
//                             // n8 to n32 modr/m sign extended encoding
//
// Note:
//  Only 1 target can be a memory target.
//  In 64 bit mode, if you use RIP as a base reg, you can not have an index reg.
//  If the address mode is 64 bits, the data size is 64 bits, and N is larger 
//   than what will fit in a signed 32 bit integer, and the destination is memory
//   then  value N  RAX MOV,  RAX mem ADC,  is compiled.
//  If the address mode is 64 bits, the data size is 64 bits, and N is larger 
//   than what will fit in a signed 32 bit integer, and the destination is a
//   register then  register PUSH,  N register MOV,  register RSP [R] ADC,
//   register POP,  is compiled.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... 
//   you may see some strange things if you are not careful.
//  
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthadcxcomma ( ADCX, )
//
// C prototype:
//  void dg_forthadcxcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 ADCX instruction. This opcode sequence adds the destination target to
//    the source target plus the carry bit, and puts the result in the
//    destination target, then changes the carry flag accordingly. The other
//    flags are not modified. This instruction does the same thing as the
//    ADC instruction except only the carry flag is modified.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  ECX  EAX  ADCX,            // adds ECX + carry flag to EAX
//  ECX <- EAX  ADCX,          // adds EAX + carry flag to ECX
//  RAX R8 ADCX,               // adds RAX + carry flag to R8
//
// Note:
//  Only 1 target can be a memory target.
//  The destination must be a register target.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthaddcomma ( ADD, )
//
// C prototype:
//  void dg_forthaddcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist          
//  targetyparameterlist          
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   immediatevalue minimumimmediatesize IMMEDIATE
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   8BIT
//   16BIT
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               integer constant that gets sign extended to
//                                 the DATASIZE of the instruction
//                                 64BIT encoding uses multiple instructions.
//                                 Smaller encodings get sign extended if needed.
//   minimumimmediatesize         minimum encoding size in bytes for 
//                                 immediatevalue.
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. If N is larger than a signed
//                                 32 bit integer, multiple instructions 
//                                 are compiled and if the destination is memory
//                                 RAX is used.
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                           sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate 
//                                 value in bytes, can be either 0, 1, 2, or 4
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target. 
//                                 R is optional.
//                                 Using R also forces some instructions to be 
//                                 encoded using the MODR encoding.
//                                 For INC, and DEC, in 32 bit mode; and also
//                                  PUSH, and POP, using R forces the use
//                                  of MODR encoding instead of opcode+r.
//                                 For the ADD, ADC, AND, OR, XOR, SUB, SBB, and
//                                  CMP immediate to register A instructions
//                                  (the accumulator forms), using R forces the
//                                  use of MODR encoding instead of the
//                                  register A opcodes.
//                                 The MODR encoding for add 8 bit immediate 
//                                  sign extended to 32 or 64 bits is shorter
//                                  than the reg a opcode encoding. If you
//                                  want this shorter encoding, you have to
//                                  use R. 
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1, 2, 4, or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//                                                              
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 ADD instruction. This opcode sequence adds the destination target to the 
//    source target and puts the result in the destination target, then changes the
//    condition code flags accordingly.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX 12348000 N  EAX  ADD,  // adds 12348000 to EAX
//  27 N  CL  ADD,             // adds 27 to CL
//  AX  RBX [R]  ADD,          // adds AX to the 16 bit memory at the address
//                             //  in RBX
//  38 N  RDX [R]  32BIT ADD,  // size required, adds 38 to the 32 bit memory
//                             //  at the address in RDX
//  ECX  EAX  ADD,             // adds ECX to EAX
//  ECX <- EAX  ADD,           // adds EAX to ECX
//  RAX R8 ADD,                // adds RAX to R8
//  38 N  RAX R  ADD,          // adds 38 to RAX using the n8 to n32 modr/m sign
//                             //  extended encoding
//
// Note:
//  Only 1 target can be a memory target.
//  If the address mode is 64 bits, the data size is 64 bits, and N is larger 
//   than what will fit in a signed 32 bit integer, and the destination is memory
//   then  value N  RAX MOV,  RAX mem ADD,  is compiled.
//  If the address mode is 64 bits, the data size is 64 bits, and N is larger 
//   than what will fit in a signed 32 bit integer, and the destination is a
//   register then  register PUSH,  N register MOV,  register RSP [R] ADD,
//   register POP,  is compiled.
//    
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... 
//   you may see some strange things if you are not careful.
//  
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthaddpdcomma ( ADDPD, )
//
// C prototype:
//  void dg_forthaddpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                           sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 ADDPD instruction. This opcode sequence adds the two double floating
//   point values in the destination target to the two double floating point
//   values in the source target and puts the results in the destination target.
//   It looks like the flags are not modified.
//   Floating point exceptions are generated.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  ADDPD,      // fadds the two double float values at the address
//                             //  in RBX to the two double float values in XMM0
//  XMM2  XMM0  ADDPD,         // fadds the two double float values in XMM2 to
//                             //  the two double float values in XMM0
//  XMM2 <- XMM0  ADDPD,       // fadds the two double float values in XMM0 to
//                             //  the two double float values in XMM2
//  XMM0 XMM8 ADDPD,           // fadds the two double float values in XMM0 to
//                             //  the two double float values in XMM8
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvaddpdcomma ( VADDPD, )
//
// C prototype:
//  void dg_forthvaddpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                           sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VADDPD instruction. This opcode sequence adds each double floating
//   point value in target y to the corresponding double floating point
//   value in the source target and puts the results into the destination target.
//   It looks like the flags are not modified.
//   Floating point exceptions are generated.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VADDPD,
//                          // [RBX][63:0]   fadd XMM1[63:0]    -> XMM0[63:0]
//                          // [RBX][127:64] fadd XMM1[127:64]  -> XMM0[127:64]
//
//  XMM2  XMM1  XMM0  VADDPD,
//                          // XMM2[63:0]   fadd XMM1[63:0]    -> XMM0[63:0]
//                          // XMM2[127:64] fadd XMM1[127:64]  -> XMM0[127:64]
//
//  XMM2 <- XMM1  XMM0  VADDPD,
//                          // XMM0[63:0]   fadd XMM1[63:0]    -> XMM2[63:0]
//                          // XMM0[127:64] fadd XMM1[127:64]  -> XMM2[127:64]
//
//  YMM0  YMM1  YMM8  VADDPD,
//                          // YMM0[63:0]    fadd YMM1[63:0]    -> YMM8[63:0]
//                          // YMM0[127:64]  fadd YMM1[127:64]  -> YMM8[127:64]
//                          // YMM0[191:128] fadd YMM1[191:128] -> YMM8[191:128]
//                          // YMM0[255:192] fadd YMM1[255:192] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthaddpscomma ( ADDPS, )
//
// C prototype:
//  void dg_forthaddpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 ADDPS instruction. This opcode sequence adds the four single floating
//   point values in the destination target to the four single floating point
//   values in the source target and puts the results in the destination target.
//   It looks like the flags are not modified.
//   Floating point exceptions are generated.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  ADDPS,      // fadds the four single float values at the address
//                             //  in RBX to the four single float values in XMM0
//  XMM2  XMM0  ADDPS,         // fadds the four single float values in XMM2 to
//                             //  the four single float values in XMM0
//  XMM2 <- XMM0  ADDPS,    // fadds the four single float values in XMM0 to
//                             //  the four single float values in XMM2
//  XMM0 XMM8 ADDPS,           // fadds the four single float values in XMM0 to
//                             //  the four single float values in XMM8
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvaddpscomma ( VADDPS, )
//
// C prototype:
//  void dg_forthvaddpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VADDPS instruction. This opcode sequence adds each single floating
//   point value in target y to the corresponding single floating point value
//   in the source target and puts the results into the destination target.
//   It looks like the flags are not modified.
//   Floating point exceptions are generated.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VADDPS,  
//                        // [RBX][31:0]   fadd XMM1[31:0]    -> XMM0[31:0] 
//                        // [RBX][63:32]  fadd XMM1[63:32]   -> XMM0[63:32]  
//                        // [RBX][95:64]  fadd XMM1[95:64]   -> XMM0[95:64]
//                        // [RBX][127:96] fadd XMM1[127:96]  -> XMM0[127:96]
//  
//  XMM2  XMM1  XMM0  VADDPS,
//                        // XMM2[31:0]   fadd XMM1[31:0]    -> XMM0[31:0] 
//                        // XMM2[63:32]  fadd XMM1[63:32]   -> XMM0[63:32]  
//                        // XMM2[95:64]  fadd XMM1[95:64]   -> XMM0[95:64]
//                        // XMM2[127:96] fadd XMM1[127:96]  -> XMM0[127:96]
//
//  XMM0 <- XMM1  XMM2  VADDPS,
//                        // XMM2[31:0]   fadd XMM1[31:0]    -> XMM0[31:0] 
//                        // XMM2[63:32]  fadd XMM1[63:32]   -> XMM0[63:32]  
//                        // XMM2[95:64]  fadd XMM1[95:64]   -> XMM0[95:64]
//                        // XMM2[127:96] fadd XMM1[127:96]  -> XMM0[127:96]
//
//  YMM0  YMM1  YMM8 VADDPS, 
//                        // YMM0[31:0]    fadd YMM1[31:0]    -> YMM8[31:0] 
//                        // YMM0[63:32]   fadd YMM1[63:32]   -> YMM8[63:32]  
//                        // YMM0[95:64]   fadd YMM1[95:64]   -> YMM8[95:64]
//                        // YMM0[127:96]  fadd YMM1[127:96]  -> YMM8[127:96]
//                        // YMM0[159:128] fadd YMM1[159:128] -> YMM8[159:128] 
//                        // YMM0[191:160] fadd YMM1[191:160] -> YMM8[191:160]  
//                        // YMM0[223:192] fadd YMM1[223:192] -> YMM8[223:192]
//                        // YMM0[255:224] fadd YMM1[255:224] -> YMM8[255:224]
//
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthaddsdcomma ( ADDSD, )
//
// C prototype:
//  void dg_forthaddsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 ADDSD instruction. This opcode sequence adds the double floating
//   point value in the lower 64 bits of the destination target to the double
//   floating point value in the lower 64 bits of the source target and puts
//   the result into the lower 64 bits of the destination target.
//   It looks like the flags are not modified.
//   Floating point exceptions are generated.
//   The upper 64 bits of the destination are unchanged.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  ADDSD,      // fadds the double float value at the address
//                             //  in RBX to the double float value in XMM0
//  XMM2  XMM0  ADDSD,         // fadds the double float value in XMM2 to
//                             //  the double float value in XMM0
//  XMM2 <- XMM0  ADDSD,    // fadds the double float value in XMM0 to
//                             //  the double float value in XMM2
//  XMM0 XMM8 ADDSD,           // fadds the double float value in XMM0 to
//                             //  the double float value in XMM8
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvaddsdcomma ( VADDSD, )
//
// C prototype:
//  void dg_forthvaddsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for target y can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targets x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VADDSD instruction. This opcode sequence adds the double
//   precision floating point value in the lower 64 bits of the source to the
//   double precision floating point value in the lower 64 bits of target y
//   and puts the result into the lower 64 bits of the destination.
//   The upper 64 bits are copied from target y.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VADDSD,   
//                              // XMM1[63:0] fadd [RBX][63:0] -> XMM0[63:0]
//                              // XMM1[127:64]                -> XMM0[127:64]
//                              // 0                           -> YMM0[255:128]
//
//  XMM2  XMM1  XMM0  VADDSD,   // XMM1[63:0] fadd XMM2[63:0] -> XMM0[63:0]
//                              // XMM1[127:64]               -> XMM0[127:64]
//                              // 0                          -> YMM0[255:128]
//
//  XMM2 <- XMM1  XMM0  VADDSD, 
//                              // XMM1[63:0] fadd XMM0[63:0] -> XMM2[63:0]
//                              // XMM1[127:64]               -> XMM2[127:64]
//                              // 0                          -> YMM2[255:128]
//
//  XMM0  XMM1  XMM8  VADDSD,   // XMM1[63:0] fadd XMM0[63:0] -> XMM8[63:0]
//                              // XMM1[127:64]               -> XMM8[127:64]
//                              // 0                          -> YMM8[255:128]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthaddsscomma ( ADDSS, )
//
// C prototype:
//  void dg_forthaddsscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 ADDSS instruction. This opcode sequence adds the single floating
//   point value in the lower 32 bits of the destination target to the single
//   floating point value in the lower 32 bits of the source target and puts the
//   result in the destination target.
//   The upper 96 bits of the destination are not changed.
//   It looks like the flags are not modified.
//   Floating point exceptions are generated.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  ADDSS,      // fadds the single float value at the address
//                             //  in RBX to the single float value in XMM0
//  XMM2  XMM0  ADDSS,         // fadds the single float value in XMM2 to
//                             //  the single float value in XMM0
//  XMM2 <- XMM0  ADDSS,    // fadds the single float value in XMM0 to
//                             //  the single float value in XMM2
//  XMM0 XMM8 ADDSS,           // fadds the single float value in XMM0 to
//                             //  the single float value in XMM8
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvaddsscomma ( VADDSS, )
//
// C prototype:
//  void dg_forthvaddsscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for target y can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targets x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VADDSS instruction. This opcode sequence adds the single
//   precision floating point value in the lower 32 bits of the source
//   to the single precision floating point value in the lower 32 bits of
//   target y and puts the result into the lower 32 bits of the destination.
//   The upper 96 bits of the destination are copied from target y.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VADDSS, 
//                        // XMM1[31:0] fadd [RBX][31:0] -> XMM0[31:0]
//                        // XMM1[127:32]                -> XMM0[127:32]
//                        // 0                           -> YMM0[255:128]
//
//  XMM2  XMM1  XMM0  VADDSS,       
//                        // XMM1[31:0] fadd XMM2[31:0]  -> XMM0[31:0]
//                        // XMM1[127:32]                -> XMM0[127:32]
//                        // 0                           -> YMM0[255:128]
//
//  XMM2 <-  XMM1  XMM0  VADDSS, 
//                        // XMM1[31:0] fadd XMM0[31:0]  -> XMM2[31:0]
//                        // XMM1[127:32]                -> XMM2[127:32]
//                        // 0                           -> YMM2[255:128]
//
//  XMM0  XMM1  XMM8 VADDSS,        
//                        // XMM1[31:0] fadd XMM0[31:0]  -> XMM8[31:0]
//                        // XMM1[127:32]                -> XMM8[127:32]
//                        // 0                           -> YMM8[255:128]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthaddsubpdcomma ( ADDSUBPD, )
//
// C prototype:
//  void dg_forthaddsubpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 ADDSUBPD instruction. This opcode sequence adds the high double floating
//   point value in the destination target to the high double floating point
//   value in the source target and puts the result in the high 64 bits of the
//   destination target, and also subtracts the low double floating point value
//   in the source target from the low double floating point value in the
//   destination target and puts the result into the low 64 bits of the
//   destination target.
//   It looks like the flags are not modified.
//   Floating point exceptions are generated.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  ADDSUBPD,   // fadds the high double float value at the address
//                             //  in RBX to the high double float value in XMM0
//                             // fsubs the low double float value at the address
//                             //  in RBX from the low double float value in XMM0
//  XMM2  XMM0  ADDSUBPD,      // fadds the high double float value in XMM2 to
//                             //  the high double float value in XMM0
//                             // fsubs the low double float value in XMM2 from
//                             //  the low double float value in XMM0
//  XMM2 <- XMM0  ADDSUBPD, // fadds the high double float value in XMM0 to the
//                             //  high double float value in XMM2
//                             // fsubs the low double float value in XMM0 from
//                             //  the low double float value in XMM2
//  XMM0 XMM8 ADDSUBPD,        // fadds the high double float value in XMM0 to
//                             //  the high double float value in XMM8
//                             // fsubs the low double float value in XMM0 from
//                             //  the low double float value in XMM8
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvaddsubpdcomma ( VADDSUBPD, )
//
// C prototype:
//  void dg_forthvaddsubpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VADDSUBPD instruction. This opcode sequence adds each odd indexed 
//   double floating point value in target y to the corresponding double 
//   floating point value in the source target and puts the results into the 
//   destination target. This opcode sequence also subtracts each even indexed
//   double floating point value in the source from the corresponding double
//   floating point value in target y and puts the results into the destination.
//   It looks like the flags are not modified.
//   Floating point exceptions are generated.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VADDSUBPD, 
//                          // XMM1[63:0]    fsub [RBX][63:0]   -> XMM0[63:0]
//                          // [RBX][127:64] fadd XMM1[127:64]  -> XMM0[127:64]
//
//  XMM2  XMM1  XMM0  VADDSUBPD,        
//                          // XMM1[63:0]   fsub XMM2[63:0]    -> XMM0[63:0]
//                          // XMM2[127:64] fadd XMM1[127:64]  -> XMM0[127:64]
//
//  XMM2 <- XMM1  XMM0  VADDSUBPD,    
//                          // XMM1[63:0]   fsub XMM0[63:0]    -> XMM2[63:0]
//                          // XMM0[127:64] fadd XMM1[127:64]  -> XMM2[127:64]
//
//  YMM0  YMM1  YMM8  VADDSUBPD,
//                          // YMM1[63:0]    fsub YMM0[63:0]    -> YMM8[63:0]
//                          // YMM0[127:64]  fadd YMM1[127:64]  -> YMM8[127:64]
//                          // YMM1[191:128] fsub YMM0[191:128] -> YMM8[191:128]
//                          // YMM0[255:192] fadd YMM1[255:192] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthaddsubpscomma ( ADDSUBPS, )
//
// C prototype:
//  void dg_forthaddsubpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 ADDSUBPS instruction. This opcode sequence adds two of the four single
//   floating point values in the destination target to two of the four single
//   floating point values in the source target and puts the results in the
//   destination target. The other two of the four single floating point values
//   in the source are subtracted from the other two of the four single floating
//   point values in the destination and stored in the destination.
//   Floating point values 0 and 2 get subtracted, 1 and 3 get added.
//   It looks like the flags are not modified.
//   Floating point exceptions are generated.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  ADDSUBPS,   // fadds single float values 1 and 3 at the
//                             //  address in RBX to single float values 1
//                             //  and 3 in XMM0
//                             // fsubs single float values 0 and 2 at the
//                             //  address in RBX from single float values
//                             //  0 and 2 in XMM0
//  XMM2  XMM0  ADDSUBPS,      // fadds single float values 1 and 3 in XMM2 to
//                             //  single float values 1 and 3 in XMM0
//                             // fsubs single float values 0 and 2 in XMM2 from
//                             //  single float values 0 and 2 in XMM0
//  XMM2 <- XMM0  ADDSUBPS, // fadds single float values 1 and 3 in XMM0 to
//                             //  single float values 1 and 3 in XMM2
//                             // fsubs single float values 0 and 2 in XMM0 from
//                             //  single float values 0 and 2 in XMM2
//  XMM0 XMM8 ADDSUBPS,        // fadds single float values 1 and 3 in XMM0 to
//                             //  single float values 1 and 3 in XMM8
//                             // fsubs single float values 0 and 2 in XMM0 from
//                             //  single float values 0 and 2 in XMM8
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvaddsubpscomma ( VADDSUBPS, )
//
// C prototype:
//  void dg_forthvaddsubpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VADDSUBPS instruction. This opcode sequence adds each odd indexed
//   single floating point value in target y to the corresponding single floating
//   point value in the source target and puts the results into the destination
//   target. This opcode sequence also subtracts each even indexed single
//   floating point value in the source target from the corresponding single
//   floating point value in target y and puts the results into the destination.
//   It looks like the flags are not modified.
//   Floating point exceptions are generated.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VADDSUBPS,  
//                        // XMM1[31:0]    fsub [RBX][31:0]    -> XMM0[31:0] 
//                        // [RBX][63:32]  fadd XMM1[63:32]   -> XMM0[63:32]  
//                        // XMM1[95:64]   fsub [RBX][95:64]   -> XMM0[95:64]
//                        // [RBX][127:96] fadd XMM1[127:96]  -> XMM0[127:96]
//  
//  XMM2  XMM1  XMM0  VADDSUBPS,
//                        // XMM1[31:0]   fsub XMM2[31:0]    -> XMM0[31:0] 
//                        // XMM2[63:32]  fadd XMM1[63:32]   -> XMM0[63:32]  
//                        // XMM1[95:64]  fsub XMM2[95:64]   -> XMM0[95:64]
//                        // XMM2[127:96] fadd XMM1[127:96]  -> XMM0[127:96]
//
//  XMM0 <- XMM1  XMM2  VADDSUBPS,
//                        // XMM1[31:0]   fsub XMM2[31:0]    -> XMM0[31:0] 
//                        // XMM2[63:32]  fadd XMM1[63:32]   -> XMM0[63:32]  
//                        // XMM1[95:64]  fsub XMM2[95:64]   -> XMM0[95:64]
//                        // XMM2[127:96] fadd XMM1[127:96]  -> XMM0[127:96]
//
//  YMM0  YMM1  YMM8 VADDSUBPS, 
//                        // YMM1[31:0]    fsub YMM0[31:0]    -> YMM8[31:0] 
//                        // YMM0[63:32]   fadd YMM1[63:32]   -> YMM8[63:32]  
//                        // YMM1[95:64]   fsub YMM0[95:64]   -> YMM8[95:64]
//                        // YMM0[127:96]  fadd YMM1[127:96]  -> YMM8[127:96]
//                        // YMM1[159:128] fsub YMM0[159:128] -> YMM8[159:128] 
//                        // YMM0[191:160] fadd YMM1[191:160] -> YMM8[191:160]  
//                        // YMM1[223:192] fsub YMM0[223:192] -> YMM8[223:192]
//                        // YMM0[255:224] fadd YMM1[255:224] -> YMM8[255:224]
//
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthadoxcomma ( ADOX, )
//
// C prototype:
//  void dg_forthadoxcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  If you want to set the data size for memory targets, you can use these:
//   (This is optional for this instruction. The compiler can figure it out from
//   the size of the destination register.)
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of a memory target in bytes,
//                                 can be 4 or 8
//   DATASIZE                     sets the data size of a memory target.
//                                 This is pushed after a memory target's
//                                 parameters and can not come in the middle
//                                 of a memory target.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 ADOX instruction. This opcode sequence adds the source, the
//   destination, and the overflow flag, then puts the result into the
//   destination. If there is a carry from the addition, the overflow flag
//   is set, otherwise it is cleared. No other flags are modified.
//   The size can be 4 or 8 bytes.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
//   RAX [R]  ECX  ADOX,  // [RAX][31:0] + ECX + OF -> OF:ECX
//
//   RAX [R]  RCX  ADOX,  // [RAX][63:0] + RCX + OF -> OF:RCX
//
//   EAX  ECX  ADOX,      // EAX + ECX + OF -> OF:ECX
//
//   RAX  RCX  ADOX,      // RAX + RCX + OF -> OF:RCX
//
// Note:
//  This instruction is useful for adding large integers when you need to
//   preserve the flags.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthaesdeccomma ( AESDEC, )
//
// C prototype:
//  void dg_forthaesdeccomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 AESDEC instruction. This opcode sequence does one round of AES
//   decryption flow. The source holds the 128 bit key and can be an xmm register
//   or memory. The destination has to be an xmm register and holds the 128 bit
//   state.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  AESDEC,     // Does one round of AES decryption on the value
//                             //  in XMM0 using the key from the memory at the
//                             //  address in RBX
//  XMM2  XMM0  AESDEC,        // Does one round of AES decryption on the value
//                             //  in XMM0 using the key from XMM2
//  XMM2 <- XMM0  AESDEC,   // Does one round of AES decryption on the value
//                             //  in XMM2 using the key from XMM0
//  XMM0 XMM8 AESDEC,          // Does one round of AES decryption on the value
//                             //  in XMM8 using the key from XMM0
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvaesdeccomma ( VAESDEC, )
//
// C prototype:
//  void dg_forthvaesdeccomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for target y can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter lists for target x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VAESDEC instruction. This opcode sequence does one round of AES
//   decryption flow. The source holds the 128 bit key and can be an xmm register
//   or memory. The target y has to be an xmm register and holds the 128 bit
//   state. The result gets put into the xmm register destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VAESDEC, // Does one round of AES decryption on the value
//                                //  in XMM1 using the key from the memory at the
//                                //  address in RBX
//  XMM2  XMM1  XMM0  VAESDEC,     // Does one round of AES decryption on the value
//                                 //  in XMM1 using the key from XMM2
//  XMM2 <- XMM1  XMM0  VAESDEC, // Does one round of AES decryption on the value
//                                  //  in XMM1 using the key from XMM0
//  XMM0  XMM1  XMM8  VAESDEC,      // Does one round of AES decryption on the value
//                                  //  in XMM1 using the key from XMM0
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y
//   must both be xmm registers.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthaesdeclastcomma ( AESDECLAST, )
//
// C prototype:
//  void dg_forthaesdeclastcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 AESDECLAST instruction. This opcode sequence does the last round of AES
//   decryption flow. The source holds the 128 bit key and can be an xmm register
//   or memory. The destination has to be an xmm register and holds the 128 bit state.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  AESDECLAST,   // Does the last round of AES decryption on the value
//                               //  in XMM0 using the key from the memory at the
//                               //  address in RBX
//  XMM2  XMM0  AESDECLAST,      // Does the last round of AES decryption on the value
//                               //  in XMM0 using the key from XMM2
//  XMM2 <- XMM0  AESDECLAST,    // Does the last round of AES decryption on the value
//                               //  in XMM2 using the key from XMM0
//  XMM0 XMM8 AESDECLAST,        // Does the last round of AES decryption on the value
//                               //  in XMM8 using the key from XMM0
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvaesdeclastcomma ( VAESDECLAST, )
//
// C prototype:
//  void dg_forthvaesdeclastcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for target y can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for target x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VAESDECLAST instruction. This opcode sequence does the last round of AES
//   decryption flow. The source holds the 128 bit key and can be an xmm register
//   or memory. Target y has to be an xmm register and holds the 128 bit state.
//   The result is put into the xmm register destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VAESDECLAST,   
//                               // Does the last round of AES decryption on the value
//                               //  in XMM1 using the key from the memory at the
//                               //  address in RBX
//  XMM2  XMM1  XMM0  VAESDECLAST,      
//                               // Does the last round of AES decryption on the value
//                               //  in XMM1 using the key from XMM2
//  XMM2 <-  XMM1  XMM0  VAESDECLAST,
//                               // Does the last round of AES decryption on the value
//                               //  in XMM1 using the key from XMM0
//  XMM0  XMM1  XMM8 VAESDECLAST,        
//                               // Does the last round of AES decryption on the value
//                               //  in XMM1 using the key from XMM0
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y must be 
//   an xmm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthaesenccomma ( AESENC, )
//
// C prototype:
//  void dg_forthaesenccomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 AESENC instruction. This opcode sequence does one round of AES
//   encryption flow. The source holds the 128 bit key and can be an xmm register
//   or memory. The destination has to be an xmm register and holds the 128 bit
//   state.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  AESENC,     // Does one round of AES encryption on the value
//                             //  in XMM0 using the key from the memory at the
//                             //  address in RBX
//  XMM2  XMM0  AESENC,        // Does one round of AES encryption on the value
//                             //  in XMM0 using the key from XMM2
//  XMM2 <- XMM0  AESENC,      // Does one round of AES encryption on the value
//                             //  in XMM2 using the key from XMM0
//  XMM0 XMM8 AESENC,          // Does one round of AES encryption on the value
//                             //  in XMM8 using the key from XMM0
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvaesenccomma ( VAESENC, )
//
// C prototype:
//  void dg_forthvaesenccomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for target y can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for target x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VAESENC instruction. This opcode sequence does one round of AES
//   encryption flow. The source holds the 128 bit key and can be an xmm register
//   or memory. Target y has to be an xmm register and holds the 128 bit
//   state. The result is put into the xmm register destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VAESENC,     
//                             // Does one round of AES encryption on the value
//                             //  in XMM1 using the key from the memory at the
//                             //  address in RBX
//  XMM2  XMM1  XMM0  VAESENC,  // Does one round of AES encryption on the value
//                              //  in XMM1 using the key from XMM2
//  XMM2 <-  XMM1  XMM0  VAESENC,
//                             // Does one round of AES encryption on the value
//                             //  in XMM1 using the key from XMM0
//  XMM0  XMM1  XMM8 VAESENC,   // Does one round of AES encryption on the value
//                              //  in XMM1 using the key from XMM0
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y must be 
//   an xmm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthaesenclastcomma ( AESENCLAST, )
//
// C prototype:
//  void dg_forthaesenclastcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 AESENCLAST instruction. This opcode sequence does the last round of AES
//   encryption flow. The source holds the 128 bit key and can be an xmm register or
//   memory. The destination has to be an xmm register and holds the 128 bit state.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  AESENCLAST,   // Does the last round of AES encryption on the value
//                               //  in XMM0 using the key from the memory at the
//                               //  address in RBX
//  XMM2  XMM0  AESENCLAST,      // Does the last round of AES encryption on the value
//                               //  in XMM0 using the key from XMM2
//  XMM2 <- XMM0  AESENCLAST,    // Does the last round of AES encryption on the value
//                               //  in XMM2 using the key from XMM0
//  XMM0 XMM8 AESENCLAST,        // Does the last round of AES encryption on the value
//                               //  in XMM8 using the key from XMM0
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvaesenclastcomma ( VAESENCLAST, )
//
// C prototype:
//  void dg_forthvaesenclastcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VAESENCLAST instruction. This opcode sequence does the last round of AES
//   encryption flow. The source holds the 128 bit key and can be an xmm register or
//   memory. Target y has to be an xmm register and holds the 128 bit state. The
//   result gets put into the xmm register destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VAESENCLAST,   
//                               // Does the last round of AES encryption on the value
//                               //  in XMM1 using the key from the memory at the
//                               //  address in RBX
//  XMM2  XMM1  XMM0  VAESENCLAST,      
//                               // Does the last round of AES encryption on the value
//                               //  in XMM1 using the key from XMM2
//  XMM2 <- XMM1  XMM0  VAESENCLAST, 
//                               // Does the last round of AES encryption on the value
//                               //  in XMM1 using the key from XMM0
//  XMM0  XMM1  XMM8  VAESENCLAST,        
//                               // Does the last round of AES encryption on the value
//                               //  in XMM1 using the key from XMM0
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y must
//   both be xmm registers.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthaeskeygenassistcomma ( AESKEYGENASSIST, )
//
// C prototype:
//  void dg_forthaeskeygenassistcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 AESKEYGENASSIST instruction. This opcode sequence performs the
//   AESKEYGENASSIST function on the value in the source target using the round
//   key constant in the immediate value. The result is put into the destination
//   target.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  28 N  RBX [R]  XMM0  AESKEYGENASSIST,  // AESKEYGENASSIST(28, [RBX][127:0])
//                                         //  -> XMM0[127:0]
//
//  28 N  XMM2  XMM0  AESKEYGENASSIST,     // AESKEYGENASSIST(28, XMM2[127:0])
//                                         //  -> XMM0[127:0]
//
//  28 N  XMM2 <- XMM0  AESKEYGENASSIST, // AESKEYGENASSIST(28, XMM0[127:0])
//                                          //  -> XMM2[127:0]
//
//  28 N  XMM0  XMM8 AESKEYGENASSIST,       // AESKEYGENASSIST(28, XMM0[127:0])
//                                          //  -> XMM8[127:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvaeskeygenassistcomma ( VAESKEYGENASSIST, )
//
// C prototype:
//  void dg_forthvaeskeygenassistcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VAESKEYGENASSIST instruction. This opcode sequence performs the
//   VAESKEYGENASSIST function on the value in the source target using the round
//   key constant in the immediate value. The result is put into the destination
//   target.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  28 N  RBX [R]  XMM0  VAESKEYGENASSIST, // AESKEYGENASSIST(28, [RBX][127:0])
//                                         //  -> XMM0[127:0]
//
//  28 N  XMM2  XMM0  VAESKEYGENASSIST,    // AESKEYGENASSIST(28, XMM2[127:0])
//                                         //  -> XMM0[127:0]
//
//  28 N  XMM2 <- XMM0  VAESKEYGENASSIST, // AESKEYGENASSIST(28, XMM0[127:0])
//                                           //  -> XMM2[127:0]
//
//  28 N  XMM0  XMM8 VAESKEYGENASSIST,      // AESKEYGENASSIST(28, XMM0[127:0])
//                                          //  -> XMM8[127:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthaesimccomma ( AESIMC, )
//
// C prototype:
//  void dg_forthaesimccomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 AESIMC instruction. This opcode sequence does an inverse mix columns
//   transformation on the 128 bit source and stores the result in the destination.
//   The source can be an xmm register or memory.
//   The destination has to be an xmm register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  AESIMC,       // Does the inverse mix column transformation on the
//                               //  value at the address in RBX and stores the result
//                               //  to XMM0
//  XMM2  XMM0  AESIMC,          // Does the inverse mix column transformation on the
//                               //  value in XMM2 and stores the result to XMM0
//  XMM2 <- XMM0  AESIMC,     // Does the inverse mix column transformation on the
//                               //  value in XMM0 and stores the result to XMM2
//  XMM0 XMM8 AESIMC,            // Does the inverse mix column transformation on the
//                               //  value in XMM0 and stores the result to XMM8
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvaesimccomma ( VAESIMC, )
//
// C prototype:
//  void dg_forthvaesimccomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VAESIMC instruction. This opcode sequence does an inverse mix columns
//   transformation on the 128 bit source and stores the result in the destination.
//   The source can be an xmm register or memory.
//   The destination has to be an xmm register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VAESIMC,      // Does the inverse mix column transformation on the
//                               //  value at the address in RBX and stores the result
//                               //  to XMM0
//  XMM2  XMM0  VAESIMC,         // Does the inverse mix column transformation on the
//                               //  value in XMM2 and stores the result to XMM0
//  XMM2 <- XMM0  VAESIMC,    // Does the inverse mix column transformation on the
//                               //  value in XMM0 and stores the result to XMM2
//  XMM0  XMM8  VAESIMC,         // Does the inverse mix column transformation on the
//                               //  value in XMM0 and stores the result to XMM8
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthandcomma ( AND, )
//
// C prototype:
//  void dg_forthandcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist          
//  targetyparameterlist          
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   immediatevalue minimumimmediatesize IMMEDIATE
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   8BIT
//   16BIT
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               integer constant that gets sign extended to
//                                 the DATASIZE of the instruction
//                                 64BIT encoding uses multiple instructions
//                                 smaller encodings get sign extended if needed
//   minimumimmediatesize         minimum encoding size in bytes for 
//                                 immediatevalue.
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. If N is larger than a signed
//                                 32 bit integer, multiple instructions 
//                                 are compiled and if the destination is memory
//                                 RAX is used.
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate 
//                                 value in bytes, can be either 0, 1, 2, or 4
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target. 
//                                 R is optional.
//                                 Using R also forces some instructions to be 
//                                 encoded using the MODR encoding.
//                                 For INC, and DEC, in 32 bit mode; and also
//                                  PUSH, and POP, using R forces the use
//                                  of MODR encoding instead of opcode+r.
//                                 For ADD, ADC, AND, OR, XOR, SUB, SBB, and
//                                  CMP, immediate to register A (AL, AX, EAX,
//                                  or RAX) instructions, using R forces the use
//                                  of MODR encoding instead of the reg a opcodes.
//                                 The MODR encoding for add 8 bit immediate 
//                                  sign extended to 32 or 64 bits is shorter
//                                  than the reg a opcode encoding. If you
//                                  want this shorter encoding, you have to
//                                  use R.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m is not supported.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1, 2, 4, or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//                                                              
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 AND instruction. This opcode sequence binary ands the destination
//   target with the source target and stores the result in the destination, 
//   changing the condition code flags accordingly.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX 12348000 N  EAX  AND,  // ands 12348000 with EAX to EAX
//  27 N  CL  AND,             // ands 27 with CL to CL
//  AX  EBX [R]  AND,          // ands AX with the 16 bit memory at the address
//                             //  in EBX to the 16 bit memory at the address
//                             //  in EBX
//  38 N  EDX [R]  32BIT AND,  // size required, ands 38 with the 32 bit memory
//                             //  at the address in EDX to the 32 bit memory
//                             //  at the address in EDX
//  ECX  EAX  AND,             // ands ECX with EAX to EAX
//  ECX <- EAX  AND,        // ands EAX with ECX to ECX
//  -38 N  RAX R  AND,         // ands -38 with RAX using the n8 to n64 modr/m 
//                             //  sign extended encoding 
//
// Note:
//  Only 1 target can be a memory target.
//  If the address mode is 64 bits, the data size is 64 bits, and N is larger 
//   than what will fit in a signed 32 bit integer, and the destination is memory
//   then  value N  RAX  MOV,  RAX mem AND, is compiled.
//  If the address mode is 64 bits, the data size is 64 bits, and N is larger 
//   than what will fit in a signed 32 bit integer, and the destination is a
//   register then  register PUSH,  N register MOV,  register RSP [R] AND,
//   register POP,  is compiled.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthandncomma ( ANDN, )
//
// C prototype:
//  void dg_forthandncomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for targetyparameterlist can
//   contain this addressing mode specifier:
//
//   targetregister
//
//  The parameter list for targetxparameterlist and targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a targetxparameterlist and
//   targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  If you want to set the data size for memory targets, you can use these:
//   (This is optional for this instruction. The compiler can figure it out from
//   the size of the destination register.)
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of a memory target in bytes,
//                                 can be 4 or 8
//   DATASIZE                     sets the data size of a memory target
//                                 This is pushed after a memory target's
//                                 parameters and can not come in the middle
//                                 of a memory target's parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 ANDN instruction. This opcode sequence bitwise ands the source
//   target with the logical inverse of target y and puts the result into the 
//   destination target.
//   If the high bit of the result is set, the sign flag is set.
//   If the result is zero, the zero flag is set.
//   The overflow and carry flags are cleared.
//   The other flags are not modified.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
//   RAX [R]  EDX  ECX  ANDN,  // [RAX][31:0] and (not EDX) -> ECX
//
//   RAX [R]  RDX  RCX  ANDN,  // [RAX][63:0] and (not RDX) -> RCX
//
//   RAX  RDX  RCX  ANDN,      // RAX and (not RDX) -> RCX
//
//   RCX <- RDX  RAX  ANDN, // RAX and (not RDX) -> RCX
//
// Note:
//  Putting reverse after any target makes the first target pushed the
//   destination target, and the third target pushed the source target.
//  Only one target can be a memory target. The destination must be a
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthandnpdcomma ( ANDNPD, )
//
// C prototype:
//  void dg_forthandnpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 ANDNPD instruction. Intel docs say this opcode sequence does a bitwise
//   and of the inverse of the two double floating point values in the destination
//   target with the two double floating point values in the source target and puts
//   the results in the destination target. Except for the flags, it's really just
//   the same thing as doing a binary invert of the destination and then a 128 bit
//   binary and.
//   It looks like the flags are not modified.
//   No floating point exceptions are generated.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  ANDNPD,     // [RBX] and (not XMM0) -> XMM0
//  XMM2  XMM0  ANDNPD,        // XMM2 and (not XMM0) -> XMM0
//  XMM2 <- XMM0  ANDNPD,   // XMM0 and (not XMM2) -> XMM2
//  XMM0 XMM8 ANDNPD,          // XMM0 and (not XMM8) -> XMM8
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvandnpdcomma ( VANDNPD, )
//
// C prototype:
//  void dg_forthvandnpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VANDNPD instruction. Intel docs say this opcode sequence does a bitwise
//   and of the inverse of the two or four double floating point values in target y
//   with the two or four double floating point values in the source target and
//   puts the results in the destination target. Except for the flags, it's really
//   just the same thing as doing a binary invert of target y and then a 128 bit
//   or 256 bit binary and.
//   It looks like the flags are not modified.
//   No floating point exceptions are generated.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VANDNPD,     // [RBX] and (not XMM1) -> XMM0
//  XMM2 XMM1 XMM0  VANDNPD,        // XMM2  and (not XMM1) -> XMM0
//  XMM2 <- XMM1 XMM0  VANDNPD,  // XMM0  and (not XMM1) -> XMM2
//  XMM0 XMM1 XMM8 VANDNPD,         // XMM0  and (not XMM1) -> XMM8
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthandnpscomma ( ANDNPS, )
//
// C prototype:
//  void dg_forthandnpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 ANDNPS instruction. Intel docs say this opcode sequence does a
//   bitwise inversion of the four single floating point values in the destination
//   then ands them with the four single floating point values in the source target
//   and puts the results in the destination target.
//   Except for the flags, it's really just a 128 bit binary inversion of the
//   destination then a 128 bit binary bitwise and.
//   It looks like the flags are not modified.
//   No floating point exceptions are generated.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  ANDNPS,      // [RBX] and (not XMM0) -> XMM0
//  XMM2  XMM0  ANDNPS,         // XMM2 and (not XMM0) -> XMM0
//  XMM2 <- XMM0  ANDNPS,    // XMM0 and (not XMM2) -> XMM2
//  XMM0 XMM8 ANDNPS,           // XMM0 and (not XMM8) -> XMM8
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvandnpscomma ( VANDNPS, )
//
// C prototype:
//  void dg_forthvandnpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VANDNPS instruction. Intel docs say this opcode sequence does a
//   bitwise inversion of the four or eight single floating point values in
//   target y then ands them with the four or eight single floating point values
//   in the source target and puts the results in the destination target.
//   Except for the flags, it's really just a 128 bit or 256 bit binary inversion
//   of target y then a 128 bit or 256 bit binary bitwise and.
//   It looks like the flags are not modified.
//   No floating point exceptions are generated.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VANDNPS,      // [RBX] and (not XMM1) -> XMM0
//  XMM2  XMM1  XMM0  VANDNPS,         // XMM2 and (not XMM1) -> XMM0
//  XMM2 <- XMM1  XMM0  VANDNPS,    // XMM0 and (not XMM1) -> XMM2
//  XMM0 XMM1  XMM8 VANDNPS,           // XMM0 and (not XMM1) -> XMM8
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthandpdcomma ( ANDPD, )
//
// C prototype:
//  void dg_forthandpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 ANDPD instruction. Intel docs say this opcode sequence does a bitwise
//   and of the two double floating point values in the destination target with the
//   two double floating point values in the source target and puts the results
//   in the destination target. It's really just the same thing as a 128 bit
//   binary and.
//   It looks like the flags are not modified.
//   No floating point exceptions are generated.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  ANDPD,      // [RBX] and XMM0 -> XMM0
//  XMM2  XMM0  ANDPD,         // XMM2 and XMM0 -> XMM0
//  XMM2 <- XMM0  ANDPD,    // XMM0 and XMM2 -> XMM2
//  XMM0 XMM8 ANDPD,           // XMM0 and XMM8 -> XMM8
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvandpdcomma ( VANDPD, )
//
// C prototype:
//  void dg_forthvandpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VANDPD instruction. Intel docs say this opcode sequence does a 
//   bitwise and of the two or four double floating point values in target y with 
//   the two or four double floating point values in the source target and puts 
//   the  results in the destination target. It's really just the same thing as a 
//   128 bit or 256 bit binary and.
//   It looks like the flags are not modified.
//   No floating point exceptions are generated.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VANDPD,      // [RBX] and XMM1 -> XMM0
//  XMM2  XMM1  XMM0  VANDPD,         // XMM2 and  XMM1 -> XMM0
//  XMM2 <- XMM1  XMM0  VANDPD,    // XMM0 and  XMM1 -> XMM2
//  YMM0  YMM1  YMM8 VANDPD,          // YMM0 and  YMM1 -> YMM8
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthandpscomma ( ANDPS, )
//
// C prototype:
//  void dg_forthandpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 ANDPS instruction. Intel docs say this opcode sequence does a bitwise
//   and of the four single floating point values in the destination target with
//   the four single floating point values in the source target and puts the
//   results in the destination target. It's really just a 128 bit binary and.
//   It looks like the flags are not modified.
//   No floating point exceptions are generated.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  ANDPS,      // [RBX] and XMM0 -> XMM0
//  XMM2  XMM0  ANDPS,         // XMM2 and XMM0 -> XMM0
//  XMM2 <- XMM0  ANDPS,    // XMM0 and XMM2 -> XMM2
//  XMM0 XMM8 ANDPS,           // XMM0 and XMM8 -> XMM8
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvandpscomma ( VANDPS, )
//
// C prototype:
//  void dg_forthvandpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VANDPS instruction. Intel docs say this opcode sequence does a bitwise
//   and of the four or eight single floating point values in target y with
//   the four or eight single floating point values in the source target and puts
//   the results in the destination target. It's really just the same thing as a
//   128 bit or 256 bit binary and.
//   It looks like the flags are not modified.
//   No floating point exceptions are generated.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VANDPS,      // [RBX] and XMM1 -> XMM0
//  XMM2  XMM1  XMM0  VANDPS,         // XMM2 and XMM1 -> XMM0
//  XMM2 <- XMM1  XMM0  VANDPS,    // XMM0 and XMM1 -> XMM2
//  XMM0 XMM1  XMM8 VANDPS,           // XMM0 and XMM1 -> XMM8
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_fortharplcomma ( ARPL, )
//
// C prototype:
//  void dg_fortharplcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how targetxparameterlist is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the data size for a memory target, you can use this:
//   16BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AX BX CX DX BP SI DI SP
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after targetxparameterlist
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 2
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after targetxparameterlist
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 ARPL instruction. This opcode sequence assumes the source and
//   destination values are segment selectors and changes bits 1 and 0
//   of the destination to be equal to bits 1 and 0 of
//   the source if bits 1 and 0 of the destination are less than the source.
//   Lower values of bits 1 and 0 mean higher privilege levels, so this
//   opcode sequence lowers the privilege level of the destination to be
//   the same as the source if needed.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]   CX   ARPL,       // if CX[1:0] < [RAX][1:0] then
//                             //  [RAX][1:0] -> CX[1:0]
//
// Note:
//  Only 16 bit memory and register targets are supported.
//  Reverse is not supported. The source must be a register target.
//  The destination can be a memory or register target.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthbextrcomma ( BEXTR, )
//
// C prototype:
//  void dg_forthbextrcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for targetxparameterlist can
//   contain this addressing mode specifier:
//
//   targetregister
//
//  The parameter list for targetyparameterlist and targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetyparameterlist and
//   targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  If you want to set the data size for memory targets, you can use these:
//   (This is optional for this instruction. The compiler can figure it out from
//   the size of the destination register.)
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of a memory target in bytes,
//                                 can be 4 or 8
//   DATASIZE                     sets the data size of a memory target
//                                 This is pushed after a memory target's
//                                 parameters and can not come in the middle
//                                 of a memory target's parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 BEXTR instruction. This opcode sequence extracts a substring of bits
//   from the source using the start offset and length from target x and puts
//   the result into the beginning of the destination. The remaining destination
//   bits are cleared. If the value put into the destination is 0, the ZF is set.
//   The start offset is from the lowest byte of target x. The length is from
//   the second lowest byte of target x.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples
//
//   EDX  RAX [R]  ECX  BEXTR,  // [RAX][EDX[7:0]+EDX[15:8]-1:EDX[7:0]] ->
//                              //  ECX[EDX[15:8]-1:0]
//                              // 0 -> ECX[31:EDX[15:8]]
//
//   RDX  RAX [R]  RCX  BEXTR,  // [RAX][RDX[7:0]+RDX[15:8]-1:RDX[7:0]] ->
//                              //  RCX[RDX[15:8]-1:0]
//                              // 0 -> RCX[63:RDX[15:8]]
//
//   RDX  RAX  RCX  BEXTR,      // RAX[RDX[7:0]+RDX[15:8]-1:RDX[7:0]] ->
//                              //  RCX[RDX[15:8]-1:0]
//                              // 0 -> RCX[63:RDX[15:8]]
//
//   RDX  RCX <-  RAX  BEXTR,   // RAX[RDX[7:0]+RDX[15:8]-1:RDX[7:0]] ->
//                              //  RCX[RDX[15:8]-1:0]
//                              // 0 -> RCX[63:RDX[15:8]]
//
// Note:
//  Putting reverse after any target makes the second target pushed the
//   destination target, and the third target pushed the source target.
//  Only one target can be a memory target. The destination must be a
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthblendpdcomma ( BLENDPD, )
//
// C prototype:
//  void dg_forthblendpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( immediatetargetparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  immediatetargetparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for the immediate target can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter lists for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter lists for targets x and y can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 BLENDPD instruction. This opcode sequence copies each of 2 64 bit
//   values from the source if the bit in the immediate target matching the
//   position in the source is set. If the bit in the immediate target is
//   clear the destination is unchanged. Only bits 0 and 1 of the immediate
//   target are used. For example, if bit 0 of the immediate target is set,
//   then the lowest 64 bits of the source are copied to the destination.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  01 N  RBX [R]  XMM0  BLENDPD,   // [RBX][63:0]   -> XMM0[63:0]
//
//  02 N  RBX [R]  XMM0  BLENDPD,   // [RBX][127:64]  -> XMM0[127:64]
//
//  03 N  RBX [R]  XMM0  BLENDPD,   // [RBX][127:0]  -> XMM0[127:0]
//
//  02 N  XMM2  XMM0  BLENDPD,      // XMM2[127:64] -> XMM0[127:64]
//
//  02 N  XMM2 <- XMM0  BLENDPD, // XMM0[127:64]  -> XMM2[127:64]
//
//  02 N  XMM0  XMM8 BLENDPD,       // XMM0[127:64]  -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvblendpdcomma ( VBLENDPD, )
//
// C prototype:
//  void dg_forthvblendpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( immediatetargetparameterlist targetxparameterlist targetyparameterlist
//    targetzparameterlist -- )
//
// Data stack in:
//
//  immediatetargetparameterlist
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for the immediate target can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for target y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter lists for targets x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter lists for targets x and z can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls four targets from the data stack and compiles the opcode sequence for
//   an x86 VBLENDPD instruction. This opcode sequence copies each of 2 64 bit
//   values from the source if the bit in the immediate target matching the
//   position in the source is set. If the bit in the immediate target is
//   clear the corresponding 64 bit value from target y is copied to the
//   destination. With xmm targets, only bits 0 and 1 of the immediate
//   target are used. With ymm targets there are 4 64 bit values, so bits 0
//   through 3 are used. For example, if bit 0 of the immediate target is set,
//   then the lowest 64 bits of the source are copied to the destination.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  01 N  RBX [R]  XMM1  XMM0  VBLENDPD,   // [RBX][63:0]   -> XMM0[63:0]
//                                         // XMM1[127:64]  -> XMM0[127:64]
//
//  02 N  RBX [R]  XMM1  XMM0  VBLENDPD,   // XMM1[63:0]    -> XMM0[63:0]
//                                         // [RBX][127:64] -> XMM0[127:64]
//
//  03 N  RBX [R]  XMM1  XMM0  VBLENDPD,   // [RBX][127:0]  -> XMM0[127:0]
//
//  02 N  XMM2  XMM1  XMM0  VBLENDPD,      // XMM1[63:0]    -> XMM0[63:0]
//                                         // XMM2[127:64]  -> XMM0[127:64]
//
//  02 N  XMM2 <- XMM1  XMM0  VBLENDPD,    // XMM1[63:0]    -> XMM2[63:0]
//                                         // XMM0[127:64]  -> XMM2[127:64]
//
//  02 N  XMM0  XMM1  XMM8 VBLENDPD,       // XMM1[63:0]    -> XMM8[63:0]
//                                         // XMM0[127:64]  -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthblendpscomma ( BLENDPS, )
//
// C prototype:
//  void dg_forthblendpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( immediatetargetparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  immediatetargetparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for the immediate target can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter lists for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 BLENDPS instruction. This opcode sequence copies each of 4 32 bit
//   values from the source if the bit in the immediate target matching the
//   position in the source is set. If the bit in the immediate target is
//   clear the destination is unchanged. For example, if bit 0 of the immediate
//   target is set, then the lowest 32 bits of the source are copied to the
//   destination.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  01 N  RBX [R]  XMM0  BLENDPS,   // [RBX][31:0]   -> XMM0[31:0]
//
//  02 N  RBX [R]  XMM0  BLENDPS,   // [RBX][63:32]  -> XMM0[63:32]
//
//  04 N  RBX [R]  XMM0  BLENDPS,   // [RBX][95:64]  -> XMM0[95:64]
//
//  08 N  RBX [R]  XMM0  BLENDPS,   // [RBX][127:96] -> XMM0[127:96]
//
//  0F N  RBX [R]  XMM0  BLENDPS,   // [RBX][127:0]  -> XMM0[127:0]
//
//  0C N  RBX [R]  XMM0  BLENDPS,   // [RBX][95:64]  -> XMM0[95:64]
//                                  // [RBX][127:96] -> XMM0[127:96]
//
//  02 N  XMM2  XMM0  BLENDPS,      // XMM2[63:32]  -> XMM0[63:32]
//
//  02 N  XMM2 <- XMM0  BLENDPS, // XMM0[63:32]  -> XMM2[63:32]
//
//  02 N  XMM0  XMM8 BLENDPS,       // XMM0[63:32]  -> XMM8[63:32]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size,
//   it must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvblendpscomma ( VBLENDPS, )
//
// C prototype:
//  void dg_forthvblendpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( immediatetargetparameterlist targetxparameterlist targetyparameterlist
//    targetzparameterlist -- )
//
// Data stack in:
//
//  immediatetargetparameterlist
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for the immediate target can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for target y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter lists for targets x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter lists for targets x and z can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls four targets from the data stack and compiles the opcode sequence for
//   an x86 VBLENDPS instruction. This opcode sequence copies each of 4 32 bit
//   values from the source to the destination if the bit in the immediate
//   target matching the position in the source is set. If the bit in the
//   immediate target is clear then the corresponding value in target y is
//   copied to the destination instead. For example, if bit 0 of the immediate
//   target is set, then the lowest 32 bits of the source are copied to the
//   destination. If bit 0 of the immediate target is clear, then the lowest
//   32 bits of target y are copied to the destination.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  01 N  RBX [R]  XMM1  XMM0  VBLENDPS,   // [RBX][31:0]   -> XMM0[31:0]
//                                         // XMM1[127:32]  -> XMM0[127:32]
//
//  02 N  RBX [R]  XMM1  XMM0  VBLENDPS,   // XMM1[31:0]   -> XMM0[31:0]
//                                         // [RBX][63:32] -> XMM0[63:32]
//                                         // XMM1[127:64] -> XMM0[127:64]
//                            
//  04 N  RBX [R]  XMM1  XMM0  VBLENDPS,   // XMM1[63:0]   -> XMM0[63:0]
//                                         // [RBX][95:64] -> XMM0[95:64]
//                                         // XMM1[127:96] -> XMM0[127:96]
//
//  08 N  RBX [R]  XMM1  XMM0  VBLENDPS,   // XMM1[95:0]   -> XMM0[95:0]
//                                         // [RBX][127:96]  -> XMM0[127:96]
//
//  0F N  RBX [R]  XMM1  XMM0  VBLENDPS,   // [RBX][127:0]  -> XMM0[127:0]
//
//  0C N  RBX [R]  XMM1  XMM0  VBLENDPS,   // XMM1[63:0]   -> XMM0[63:0]
//                                         // [RBX][127:64] -> XMM0[127:64]
//
//  02 N  XMM2  XMM1  XMM0  VBLENDPS,      // XMM1[31:0]   -> XMM0[31:0]
//                                         // XMM2[63:32]  -> XMM0[63:32]
//                                         // XMM1[127:64] -> XMM0[127:64]
//
//  02 N  XMM0 <- XMM1  XMM2  VBLENDPS, // XMM1[31:0]   -> XMM0[31:0]
//                                         // XMM2[63:32]  -> XMM0[63:32]
//                                         // XMM1[127:64] -> XMM0[127:64]
//
//  02 N  XMM0  XMM1  XMM8 VBLENDPS,       // XMM1[31:0]   -> XMM8[31:0]
//                                         // XMM0[63:32]  -> XMM8[63:32]
//                                         // XMM1[127:64] -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size,
//   it must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthblendvpdcomma ( BLENDVPD, )
//
// C prototype:
//  void dg_forthblendvpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 BLENDVPD instruction. Intel docs say if bit 63 of XMM0 is 1 then the
//   low double precision floating point value in the source is copied to the
//   destination, otherwise the low double floating point value in the destination
//   is unchanged. Also, if bit 127 of XMM0 is 1, then the high floating point
//   double value in the source is copied to the destination, otherwise the high
//   floating point value in the destination is unchanged. In reality, they don't
//   have to be floating point values and any value will work.
//   It looks like the flags are not modified.
//   No floating point exceptions are generated.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  BLENDVPD,     // if (XMM0[63] is 1) then
//                               //  [memory at address RBX][63:0] -> XMM1[63:0]
//                               // if (XMM0[127] is 1) then
//                               //  [memory at address RBX][127:64] -> XMM1[127:64]
//
//  XMM2  XMM1  BLENDVPD,        // if (XMM0[63] is 1) then
//                               //  XMM2[63:0] -> XMM1[63:0]
//                               // if (XMM0[127] is 1) then
//                               //  XMM2[127:64] -> XMM1[127:64]
//
//  XMM2 <- XMM1  BLENDVPD,   // if (XMM0[63] is 1) then
//                               //  XMM1[63:0] -> XMM2[63:0]
//                               // if (XMM0[127] is 1) then
//                               //  XMM1[127:64] -> XMM2[127:64]
//
//  XMM1 XMM8 BLENDVPD,          // if (XMM0[63] is 1) then
//                               //  XMM1[63:0] -> XMM8[63:0]
//                               // if (XMM0[127] is 1) then
//                               //  XMM1[127:64] -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvblendvpdcomma ( VBLENDVPD, )
//
// C prototype:
//  void dg_forthvblendvpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist
//    targetzparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//   targetxmmregister
//
//  The parameter list for target y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for these targets x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter lists for targets x and z can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls four targets from the data stack and compiles the opcode sequence for
//   an x86 VBLENDVPD instruction. This opcode sequence copies each of 2 64 bit
//   values from the source if the high bit for that 64 bit section in the 
//   register selected by target w matching the position in the 
//   source is set. If the high bit in the 64 bit section in the register 
//   selected by target w is clear, the corresponding 64 bit value 
//   from target y is copied to the destination. If target w is an immediate
//   value, then bits 4 through 7 contain the number of the xmm register 
//   that has the choosing bits. If target w is an xmm register, then this 
//   instruction will convert the xmm register to the correct immediate value 
//   for you. Only bits 63 and 127 of this xmmr register are used to do the
//   choosing. For example, if target w is HEX 30 N and bit 63 of
//   XMM3 is set, then the lowest 64 bits of the source are copied to the 
//   destination, if bit 63 was clear then the lowest 64 bits of target y are
//   copied to the destination instead. In this case where the immediate value
//   is HEX 30, bit 127 of XMM3  is used to choose between the source and 
//   target y for the highest 64 bits that go to the destination.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  IF XMM3 = 8000000000000000
//   30 N  RBX [R]  XMM1  XMM0  VBLENDVPD,   // [RBX][63:0]   -> XMM0[63:0]
//                                           // XMM1[127:64]  -> XMM0[127:64]
//  IF XMM3 = 8000000000000000
//   XMM3  RBX [R]  XMM1  XMM0  VBLENDVPD,   // [RBX][63:0]   -> XMM0[63:0]
//                                           // XMM1[127:64]  -> XMM0[127:64]
//
//  IF XMM4 = 80000000000000000000000000000000
//   40 N  RBX [R]  XMM1  XMM0  VBLENDVPD,   // XMM1[63:0]    -> XMM0[63:0]
//                                           // [RBX][127:64] -> XMM0[127:64]
//
//  IF XMM15 = 80000000000000008000000000000000
//   F0 N  RBX [R]  XMM1  XMM0  VBLENDVPD,   // [RBX][127:0]  -> XMM0[127:0]
//
//  IF XMM14 = 80000000000000000000000000000000
//   E0 N  XMM2  XMM1  XMM0  VBLENDVPD,      // XMM1[63:0]    -> XMM0[63:0]
//                                           // XMM2[127:64]  -> XMM0[127:64]
//
//  IF XMM5 = 80000000000000000000000000000000
//   50 N  XMM2 <- XMM1  XMM0  VBLENDVPD, // XMM1[63:0]    -> XMM2[63:0]
//                                           // XMM0[127:64]  -> XMM2[127:64]
//
//  IF XMM6 = 80000000000000000000000000000000
//   60 N  XMM0  XMM1  XMM8 VBLENDVPD,       // XMM1[63:0]    -> XMM8[63:0]
//                                           // XMM0[127:64]  -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0. 
//   In 32 bit address mode bit 7 of the immediate target is ignored.
//   In both 32 bit and 64 bit address mode bits 0, 1, 2, and 3 of the immediate
//    target are ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthblendvpscomma ( BLENDVPS, )
//
// C prototype:
//  void dg_forthblendvpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 BLENDVPS instruction. Intel docs say if bit 31 of XMM0 is 1 then the
//   low single precision floating point value in the source is copied to the
//   destination, otherwise the low single floating point value in the destination
//   is unchanged. The same thing goes for bits 63, 95, and 127 of XMM0 and the
//   other three single floating point values, if the bit in XMM0 is 1 then
//   the corresponding single floating point value in the source is copied to
//   the destination. In reality, they don't have to be floating point values and
//   any value will work.
//   It looks like the flags are not modified.
//   No floating point exceptions are generated.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  BLENDVPS,     // if (XMM0[31] is 1) then
//                               //  [memory at address RBX][31:0] -> XMM1[31:0]
//                               // if (XMM0[63] is 1) then
//                               //  [memory at address RBX][63:32] -> XMM1[63:32]
//                               // if (XMM0[95] is 1) then
//                               //  [memory at address RBX][95:64] -> XMM1[95:64]
//                               // if (XMM0[127] is 1) then
//                               //  [memory at address RBX][127:96] -> XMM1[127:96]
//
//  XMM2  XMM1  BLENDVPS,        // if (XMM0[31] is 1) then
//                               //  XMM2[31:0] -> XMM1[31:0]
//                               // if (XMM0[63] is 1) then
//                               //  XMM2[63:32] -> XMM1[63:32]
//                               // if (XMM0[95] is 1) then
//                               //  XMM2[95:64] -> XMM1[95:64]
//                               // if (XMM0[127] is 1) then
//                               //  XMM2[127:96] -> XMM1[127:96]
//
//  XMM2 <- XMM1  BLENDVPS,   // if (XMM0[31] is 1) then
//                               //  XMM1[31:0] -> XMM2[31:0]
//                               // if (XMM0[63] is 1) then
//                               //  XMM1[63:32] -> XMM2[63:32]
//                               // if (XMM0[95] is 1) then
//                               //  XMM1[95:64] -> XMM2[95:64]
//                               // if (XMM0[127] is 1) then
//                               //  XMM1[127:96] -> XMM2[127:96]
//
//  XMM1 XMM8 BLENDVPS,          // if (XMM0[31] is 1) then
//                               //  XMM1[31:0] -> XMM8[31:0]
//                               // if (XMM0[63] is 1) then
//                               //  XMM1[63:32] -> XMM8[63:32]
//                               // if (XMM0[95] is 1) then
//                               //  XMM1[95:64] -> XMM8[95:64]
//                               // if (XMM0[127] is 1) then
//                               //  XMM1[127:96] -> XMM8[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvblendvpscomma ( VBLENDVPS, )
//
// C prototype:
//  void dg_forthvblendvpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist
//     targetzparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//   targetxmmregister
//
//  The parameter list for target y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for these targets x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter lists for targets x and z can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls four targets from the data stack and compiles the opcode sequence for
//   an x86 VBLENDVPS instruction. This opcode sequence copies each of 4 32 bit
//   values from the source if the high bit for that 32 bit section in the 
//   register selected by target w matching the position in the 
//   source is set. If the high bit in the 32 bit section in the register 
//   selected by target w is clear, the corresponding 32 bit value 
//   from target y is copied to the destination. If target w is an immediate
//   value, then bits 4 through 7 contain the number of the xmm register 
//   that has the choosing bits. If target w is an xmm register, then this 
//   instruction will convert the xmm register to the correct immediate value 
//   for you. Only bits 31, 63, 95 and 127 of this xmmr register are used to 
//   do the choosing. For example, if target w is 30 N and bit 31 of
//   XMM3 is set, then the lowest 32 bits of the source are copied to the
//   destination; if bit 31 is clear then the lowest 32 bits of target y are
//   copied to the destination instead. In this case where the immediate value
//   is HEX 30, bit 127 of XMM3  is used to choose between the source and 
//   target y for the highest 32 bits that go to the destination.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  IF XMM3 = 1
//   30 N  RBX [R]  XMM1  XMM0  VBLENDVPS,   // [RBX][31:0]   -> XMM0[31:0]
//                                           // XMM1[127:32]  -> XMM0[127:32]
//
//  IF XMM3 = 1
//   XMM3  RBX [R]  XMM1  XMM0  VBLENDVPS,   // [RBX][31:0]   -> XMM0[31:0]
//                                           // XMM1[127:32]  -> XMM0[127:32]
//
//  IF XMM5 = 2
//   50 N  RBX [R]  XMM1  XMM0  VBLENDVPS,   // XMM1[31:0]   -> XMM0[31:0]
//                                           // [RBX][63:32] -> XMM0[63:32]
//                                           // XMM1[127:64] -> XMM0[127:64]
// 
//  IF XMM6 = 4                           
//   60 N  RBX [R]  XMM1  XMM0  VBLENDVPS,   // XMM1[63:0]   -> XMM0[63:0]
//                                           // [RBX][95:64] -> XMM0[95:64]
//                                           // XMM1[127:96] -> XMM0[127:96]
//
//  IF XMM5 = 8
//   50 N  RBX [R]  XMM1  XMM0  VBLENDVPS,   // XMM1[95:0]   -> XMM0[95:0]
//                                           // [RBX][127:96]  -> XMM0[127:96]
//
//  IF XMM14 = 0F
//   E0 N  RBX [R]  XMM1  XMM0  VBLENDVPS,   // [RBX][127:0]  -> XMM0[127:0]
//
//  IF XMM8 = 0C
//   80 N  RBX [R]  XMM1  XMM0  VBLENDVPS,   // XMM1[63:0]   -> XMM0[63:0]
//                                           // [RBX][127:64] -> XMM0[127:64]
//
//  IF XMM7 = 2
//   70 N  XMM2  XMM1  XMM0  VBLENDVPS,      // XMM1[31:0]   -> XMM0[31:0]
//                                           // XMM2[63:32]  -> XMM0[63:32]
//                                           // XMM1[127:64] -> XMM0[127:64]
//
//  IF XMM4 = 2
//   40 N  XMM0 <- XMM1  XMM2  VBLENDVPS, // XMM1[31:0]   -> XMM0[31:0]
//                                           // XMM2[63:32]  -> XMM0[63:32]
//                                           // XMM1[127:64] -> XMM0[127:64]
//
//  IF XMM7 = 2
//   70 N  XMM0  XMM1  XMM8 VBLENDVPS,       // XMM1[31:0]   -> XMM8[31:0]
//                                           // XMM0[63:32]  -> XMM8[63:32]
//                                           // XMM1[127:64] -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size,
//   it must be 1 or 0. Intel docs show using an xmm or ymm register instead of 
//   putting the immediate value for the xmm or ymm register. I may add support 
//   for this in the future.
//   In 32 bit address mode bit 7 of the immediate target is ignored.
//   In both 32 bit and 64 bit address mode bits 0, 1, 2, and 3 of the immediate
//    target are ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvbroadcastf128comma ( VBROADCASTF128, )
//
// C prototype:
//  void dg_forthvbroadcastf128comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for these targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter lists for targets x and y can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VBROADCASTF128 instruction. This opcode sequence copies 128 bits from
//   the memory target source and puts a copy into both the 
//   low and high 128 bits of the ymm register destination.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//   RBX [R]  YMM0  VBROADCASTF128,       // [RBX][127:0]  -> YMM0[127:0]
//                                        // [RBX][127:0]  -> YMM0[255:128]
//
//   YMM0 <- RBX [R]  VBROADCASTF128,  // [RBX][127:0]   -> YMM0[127:0]
//                                        // [RBX][127:0]   -> YMM0[255:128]
//
// Note:
//  The source must be a memory target. The destination target must be a ymm
//   register. 
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvbroadcastsdcomma ( VBROADCASTSD, )
//
// C prototype:
//  void dg_forthvbroadcastsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for these targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter lists for targets x and y can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VBROADCASTSD instruction. This opcode sequence copies 64 bits from
//   the memory target or low 64 bits of an xmm register source and puts a copy 
//   into all four 64 bit sections of the ymm register destination.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//   RBX [R]  YMM0  VBROADCASTSD,       // [RBX][63:0]  -> YMM0[63:0]
//                                      // [RBX][63:0]  -> YMM0[127:64]
//                                      // [RBX][63:0]  -> YMM0[191:128]
//                                      // [RBX][63:0]  -> YMM0[255:192]
//
//   YMM0 <- XMM1  VBROADCASTSD,     // XMM1[63:0]  -> YMM0[63:0]
//                                      // XMM1[63:0]  -> YMM0[127:64]
//                                      // XMM1[63:0]  -> YMM0[191:128]
//                                      // XMM1[63:0]  -> YMM0[255:192]
//
// Note:
//  The source must be an xmm register or memory target. The destination must 
//   be a ymm register. 
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvbroadcastsscomma ( VBROADCASTSS, )
//
// C prototype:
//  void dg_forthvbroadcastsscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for these targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter lists for targets x and y can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VBROADCASTSS instruction. This opcode sequence copies 32 bits from
//   a memory target or the low 32 bits of an xmm register source and puts a 
//   copy into all 32 bit sections of an xmm or ymm register destination.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//   RBX [R]  YMM0  VBROADCASTSS,       // [RBX][31:0]  -> YMM0[31:0]
//                                      // [RBX][31:0]  -> YMM0[63:32]
//                                      // [RBX][31:0]  -> YMM0[95:64]
//                                      // [RBX][31:0]  -> YMM0[127:96]
//                                      // [RBX][31:0]  -> YMM0[159:128]
//                                      // [RBX][31:0]  -> YMM0[191:160]
//                                      // [RBX][31:0]  -> YMM0[223:192]
//                                      // [RBX][31:0]  -> YMM0[255:224]
//
//   YMM0 <- XMM1  VBROADCASTSS,     // XMM1[31:0]  -> YMM0[31:0]
//                                      // XMM1[31:0]  -> YMM0[63:32]
//                                      // XMM1[31:0]  -> YMM0[95:64]
//                                      // XMM1[31:0]  -> YMM0[127:96]
//                                      // XMM1[31:0]  -> YMM0[159:128]
//                                      // XMM1[31:0]  -> YMM0[191:160]
//                                      // XMM1[31:0]  -> YMM0[223:192]
//                                      // XMM1[31:0]  -> YMM0[255:224]
//
// Note:
//  The source must be an xmm register or memory target. The destination must 
//   be an xmm or ymm register. 
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthblsicomma ( BLSI, )
//
// C prototype:
//  void dg_forthblsicomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetxparameterlist and targetyparameterlist can
//   contain these addressing mode specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a targetxparameterlist and
//   targetyparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  If you want to set the data size for memory targets, you can use these:
//   (This is optional for this instruction. The compiler can figure it out from
//   the size of the destination register.)
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of a memory target in bytes,
//                                 can be 4 or 8
//   DATASIZE                     sets the data size of a memory target
//                                 This is pushed after a memory target's
//                                 parameters and can not come in the middle
//                                 of a memory target.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 BLSI instruction. This opcode sequence finds the lowest bit set
//   in the source target, then returns a result to the destination that has
//   that bit set and all other bits clear.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples
//
//   RAX [R]  ECX  BLSI,  // (0 - [RAX]) AND [RAX] -> ECX
//
//   RAX [R]  RCX  BLSI,  // (0 - [RAX]) AND [RAX] -> RCX
//
//   RAX  RCX  BLSI,      // (0 - RAX) AND RAX -> RCX
//
//   RCX <-  RAX  BLSI, // (0 - RAX) AND RAX -> RCX
//
// Note:
//  Putting reverse after any target makes the first target pushed the
//   destination target, and the second target pushed the source target.
//  Only one target can be a memory target. The destination must be a
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthblsmskcomma ( BLSMSK, )
//
// C prototype:
//  void dg_forthblsmskcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetxparameterlist and targetyparameterlist can
//   contain these addressing mode specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a targetxparameterlist and
//   targetyparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  If you want to set the data size for memory targets, you can use these:
//   (This is optional for this instruction. The compiler can figure it out from
//   the size of the destination register.)
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of a memory target in bytes,
//                                 can be 4 or 8
//   DATASIZE                     sets the data size of a memory target
//                                 This is pushed after a memory target's
//                                 parameters and can not come in the middle
//                                 of a memory target.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 BLSMSK instruction. This opcode sequence finds the lowest bit set
//   in the source target, then returns a result to the destination that has
//   that bit and all lower bits set, and all higher bits clear.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples
//
//   RAX [R]  ECX  BLSMSK,  // ([RAX] - 1) XOR [RAX] -> ECX
//
//   RAX [R]  RCX  BLSMSK,  // ([RAX] - 1) XOR [RAX] -> RCX
//
//   RAX  RCX  BLSMSK,      // (RAX - 1) XOR RAX -> RCX
//
//   RCX <-  RAX  BLSMSK, // (RAX - 1) XOR RAX -> RCX
//
// Note:
//  Putting reverse after any target makes the first target pushed the
//   destination target, and the second target pushed the source target.
//  Only one target can be a memory target. The destination must be a
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthblsrcomma ( BLSR, )
//
// C prototype:
//  void dg_forthblsrcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetxparameterlist and targetyparameterlist can
//   contain these addressing mode specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a targetxparameterlist and
//   targetyparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  If you want to set the data size for memory targets, you can use these:
//   (This is optional for this instruction. The compiler can figure it out from
//   the size of the destination register.)
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of a memory target in bytes,
//                                 can be 4 or 8
//   DATASIZE                     sets the data size of a memory target
//                                 This is pushed after a memory target's
//                                 parameters and can not come in the middle
//                                 of a memory target.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 BLSR instruction. This opcode sequence finds the lowest bit set
//   in the source target, then returns a result to the destination that has
//   that bit clear, leaving the other bits unchanged.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples
//
//   RAX [R]  ECX  BLSR,  // ([RAX] - 1) AND [RAX] -> ECX
//
//   RAX [R]  RCX  BLSR,  // ([RAX] - 1) AND [RAX] -> RCX
//
//   RAX  RCX  BLSR,      // (RAX - 1) AND RAX -> RCX
//
//   RCX <-  RAX  BLSR, // (RAX - 1) AND RAX -> RCX
//
// Note:
//  Putting reverse after any target makes the first target pushed the
//   destination target, and the second target pushed the source target.
//  Only one target can be a memory target. The destination must be a
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthboundcomma ( BOUND, )
//
// C prototype:
//  void dg_forthboundcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   currentcompilebufferoffset [O]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of these parameters:
//
//   targetregister               one of: AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//   baseregister                 one of: EAX EBX ECX EDX EBP ESI EDI ESP
//                                 or NOREG
//   displacement                 signed 32 bit value
//   absoluteaddress              unsigned 32 bit value
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of: EAX EBX ECX EDX EBP ESI EDI
//                                 or NOREG
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  In 32 bit addressing mode:
//   Pulls two targets from the data stack and compiles the opcode sequence for
//    an x86 BOUND instruction. The array index in the register target is
//    compared against the bounds listed in the memory target. If the array index
//    is out of bounds a range exceeded exception is signaled.
//  Not supported in 64 bit addressing mode.
//
// Note:
//  It doesn't matter which target is first or second, or what direction is
//   specified. The same opcode sequence is compiled regardless. One target
//   has to be a memory target, and the other has to be a register target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthbsfcomma ( BSF, )
//
// C prototype:
//  void dg_forthbsfcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   currentcompilebufferoffset [O]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of these parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 BSF instruction. The memory target is scanned for the least
//   significant bit set and the result is put into the register target. The
//   size of the memory target scanned is determined by the size of the
//   register target. The zero flag is set if the memory target was 0, and is
//   cleared otherwise.
//
// Note:
//  If you specify a register and a memory target, it doesn't matter which one
//   is target x or target y or what direction is used. The memory target is
//   scanned and the register target receives the result.
//  If you specify two register targets, target x is scanned by default and
//   target y receives the result.
//  If you specify two register targets along with <- then target y is
//   scanned and target x receives the result.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  EDX [R]  EAX  BSF,  // scans 32 bit memory at EDX for lowest set bit,
//                      //  the index of this bit is saved into EAX
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthbsrcomma ( BSR, )
//
// C prototype:
//  void dg_forthbsrcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   currentcompilebufferoffset [O]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of these parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 BSR instruction. The memory target is scanned for the most
//   significant bit set and the result is put into the register target.
//   The size of the memory target scanned is determined by the size of the
//   register target. The zero flag is set if the memory target was 0, and
//   is cleared otherwise.
//
// Note:
//  If you specify a register and a memory target, it doesn't matter which one
//   is target x or target y or what direction is used. The memory target is
//   scanned and the register target receives the result.
//  If you specify two register targets, target x is scanned by default and
//   target y receives the result.
//  If you specify two register targets along with <- then target y is
//   scanned and target x receives the result.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  EDX [R]  EAX  BSR,  // scans 32 bit memory at EDX for highest set bit,
//                      //  the index of this bit is saved into EAX
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthbswapcomma ( BSWAP, )
//
// C prototype:
//  void dg_forthbswapcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( targetparameterlist -- )
//
// Data stack in:
//  targetparameterlist
//
//  The parameter list for the target can contain these addressing mode specifiers:
//   targetregister R
//   targetregister
//
//  Description of these parameters:
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   R                            specifies a register target.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//    an x86 BSWAP instruction, which is a byte swap instruction.
//    This opcode sequence reverses the byte order of the target.
//    This is useful when converting from big endian to little endian,
//    and in 2D graphics applications.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  EAX  BSWAP,     // reverses byte order in EAX
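//  DX  BSWAP,      // exchanges DL with DH
//                  //  NOTE: Intel documents the result of BSWAP on a 16 bit
//                  //  register as undefined - verify before relying on this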
//  R8  BSWAP,      // reverses byte order in R8
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
 // /////////////////////////////////////////////////////////////////////////////////
 //
 // dg_forthbtcomma ( BT, )
 //
 // C prototype:
 //  void dg_forthbtcomma (Bufferhandle* pBHarrayhead)
 //
 // Inputs:
 //  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
 //                                 used as the bufferhandle for the array where
 //                                 the other bufferhandles are stored.
 //
 // Stack action shorthand:
 //
 //  ( targetxparameterlist targetyparameterlist -- )
 //
 // Data stack in:
 //
 //  targetxparameterlist
 //  targetyparameterlist
 //
 //
 //  The parameter list for a target can contain these addressing mode specifiers:
 //
 //   datavalue N
 //   targetregister
 //   baseregister [R]
 //   baseregister displacement [R+N]
 //   absoluteaddress [N]
 //   baseregister scale indexregister displacement [R+S*R+N]
 //
 //  In 64 bit addressing mode, the parameter list for a target can also
 //   contain this specifier:
 //
 //   currentcompilebufferoffset [O]
 //
 //  If you want more control over how the instruction is encoded,
 //   you can use these addressing mode specifiers instead:
 //
 //   datavalue datasize IMMEDIATE
 //   targetregister R
 //   baseregister displacement minimumdisplacementsize [MOD]
 //   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
 //
 //  If you want to set the direction for modes that allow it, which are modes
 //   with no immediate target, you can use these:
 //   ->
 //   <-
 //
 //  If you need to set the data size for modes requiring it, which are modes
 //   that do not have at least one register target, you can use these:
 //   8BIT
 //   16BIT
 //   32BIT
 //   64BIT
 //
 //  Alternative way to set the data size:
 //   datasizevalue DATASIZE
 //
 //  Description of target parameters:
 //
 //   immediatevalue               signed 32 bit value of an immediate target
 //   targetregister               in both 32 and 64 bit address mode, one of:
 //                                 AL BL CL DL AH BH CH DH
 //                                 AX BX CX DX BP SI DI SP
 //                                 EAX EBX ECX EDX EBP ESI EDI ESP
 //                                in 64 bit address mode, also one of:
 //                                 SPL BPL SIL DIL
 //                                 RAX RBX RCX RDX RBP RSI RDI RSP
 //                                 R8L R9L R10L R11L R12L R13L R14L R15L
 //                                 R8W R9W R10W R11W R12W R13W R14W R15W
 //                                 R8D R9D R10D R11D R12D R13D R14D R15D
 //                                 R8 R9 R10 R11 R12 R13 R14 R15
 //   baseregister                 one of:
 //                                 NOREG or
 //                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
 //                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
 //                                  R8 R9 R10 R11 R12 R13 R14 R15
 //   displacement                 signed 32 bit value (even in 64BIT mode)
 //   absoluteaddress              signed 32 bit value (even in 64BIT mode)
 //   scale                        index register is multiplied by the scale,
 //                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
 //   indexregister                one of:
 //                                 NOREG or
 //                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
 //                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
 //                                  R8 R9 R10 R11 R12 R13 R14 R15
 //
 //   N                            specifies an immediate target. The value
 //                                 for this target is encoded into the
 //                                 opcode sequence using the smallest size
 //                                 possible.
 //   [R]                          specifies a memory target at the address in
 //                                 the baseregister
 //   [R+N]                        specifies a memory target at the address at
 //                                 the value in the base register plus the
 //                                 signed displacement
 //   [N]                          specifies a memory target at the
 //                                 absoluteaddress which can't be larger than
 //                                 a signed 32 bit integer. This makes [N] not
 //                                 very useful in 64 bit mode
 //   [O]                          specifies a memory target at an offset in the
 //                                 current compile buffer.
 //   [R+S*R+N]                    specifies a memory target at the address at
 //                                 baseregister + (scale*indexregister) +
 //                                 displacement (displacement is signed)
 //
 //   ->                        sets the direction to forward.
 //                                 This is the default value.
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //   <-                        sets the direction to reverse.
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //
 //   8BIT                         sets the data size of the instruction to
 //                                 1 byte
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //   16BIT                        sets the data size of the instruction to
 //                                 2 bytes
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //   32BIT                        sets the data size of the instruction to
 //                                 4 bytes
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //   64BIT                        sets the data size of the instruction to
 //                                 8 bytes
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //
 //   minimumimmediatesize         minimum size used to encode the immediate
 //                                 value in bytes, can be either 0, 1, 2, or 4
 //   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
 //                                 The displacement will not be encoded using
 //                                 a size less than this.
 //   IMMEDIATE                    specifies an immediate target.
 //                                 The value for this target is encoded into
 //                                 the opcode sequence.
 //   R                            specifies a register target. If this is used
 //                                 and a register A opcode sequence for this
 //                                 instruction is available, the register A
 //                                 opcode sequence is NOT used. opcode+r, or
 //                                 modr/m is used instead.
 //                                 R is optional.
 //                                 If a targetregister is specified without R,
 //                                 and AL, AX, EAX, or RAX is specified,
 //                                 and there is a register A opcode sequence
 //                                 available, then the register A opcode
 //                                 sequence is used.
 //   [MOD]                        specifies a memory target at the address of
 //                                 baseregister plus displacement using modr/m
 //                                 encoding. The encoding may be promoted to
 //                                 sib if modr/m does not support it.
 //   [SIB]                        specifies a memory target at the address at
 //                                 baseregister + (scale*indexregister) +
 //                                 displacement (displacement is signed)
 //                                 sib encoding will be used.
 //
 //   datasizevalue                data size of the instruction in bytes,
 //                                 can be 1, 2, or 4
 //   DATASIZE                     sets the data size of the instruction
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //
 // Data stack out:
 //  none
 //
 // Execute state action:
 //  Pulls two targets from the data stack and compiles the opcode sequence for
 //   an x86 BT instruction. This opcode copies a bit from one of the targets
 //   to the carry flag.
 //  If one of the targets is immediate, then that target indexes which
 //   bit to copy in the other target.
 //  If one of the targets is memory, then that target holds the bit
 //   array and the register holds which bit to copy.
 //  If both targets are registers and forward is specified, which
 //   is the default, then the first target holds the bit array, and
 //   the second target holds which bit to copy.
 //  If both targets are registers and reverse is specified,
 //   then the second target holds the bit array, and
 //   the first target holds which bit to copy.
 //  Two memory targets are not supported.
 //  Two immediate targets are not supported.
 //
 // Compile state action:
 //  Compiles a call to a subroutine that does the execute state action.
 //
 // Examples:
 //  EAX 5 N BT, // copies bit 5 of EAX to the carry flag
 //  EAX EBX BT, // EBX indexes the bit to copy to the carry flag from EAX
 //  EAX EBX [R] BT, // EAX indexes the bit to copy to the carry flag
 //   from the memory pointed to by EBX
 //
 // Failure cases:
 //  I didn't check the failure cases thoroughly, soo...
 //   you may see some strange things if you are not careful.
 //
 // /////////////////////////////////////////////////////////////////////////////////
 
 // /////////////////////////////////////////////////////////////////////////////////
 //
 // dg_forthbtccomma ( BTC, )
 //
 // C prototype:
 //  void dg_forthbtccomma (Bufferhandle* pBHarrayhead)
 //
 // Inputs:
 //  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
 //                                 used as the bufferhandle for the array where
 //                                 the other bufferhandles are stored.
 //
 // Stack action shorthand:
 //
 //  ( targetxparameterlist targetyparameterlist -- )
 //
 // Data stack in:
 //
 //  targetxparameterlist
 //  targetyparameterlist
 //
 //
 //  The parameter list for a target can contain these addressing mode specifiers:
 //
 //   datavalue N
 //   targetregister
 //   baseregister [R]
 //   baseregister displacement [R+N]
 //   absoluteaddress [N]
 //   baseregister scale indexregister displacement [R+S*R+N]
 //
 //  In 64 bit addressing mode, the parameter list for a target can also
 //   contain this specifier:
 //
 //   currentcompilebufferoffset [O]
 //
 //  If you want more control over how the instruction is encoded,
 //   you can use these addressing mode specifiers instead:
 //
 //   datavalue datasize IMMEDIATE
 //   targetregister R
 //   baseregister displacement minimumdisplacementsize [MOD]
 //   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
 //
 //  If you want to set the direction for modes that allow it, which are modes
 //   with no immediate target, you can use these:
 //   ->
 //   <-
 //
 //  If you need to set the data size for modes requiring it, which are modes
 //   that do not have at least one register target, you can use these:
 //   8BIT
 //   16BIT
 //   32BIT
 //   64BIT
 //
 //  Alternative way to set the data size:
 //   datasizevalue DATASIZE
 //
 //  Description of target parameters:
 //
 //   immediatevalue               signed 32 bit value of an immediate target
 //   targetregister               in both 32 and 64 bit address mode, one of:
 //                                 AL BL CL DL AH BH CH DH
 //                                 AX BX CX DX BP SI DI SP
 //                                 EAX EBX ECX EDX EBP ESI EDI ESP
 //                                in 64 bit address mode, also one of:
 //                                 SPL BPL SIL DIL
 //                                 RAX RBX RCX RDX RBP RSI RDI RSP
 //                                 R8L R9L R10L R11L R12L R13L R14L R15L
 //                                 R8W R9W R10W R11W R12W R13W R14W R15W
 //                                 R8D R9D R10D R11D R12D R13D R14D R15D
 //                                 R8 R9 R10 R11 R12 R13 R14 R15
 //   baseregister                 one of:
 //                                 NOREG or
 //                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
 //                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
 //                                  R8 R9 R10 R11 R12 R13 R14 R15
 //   displacement                 signed 32 bit value (even in 64BIT mode)
 //   absoluteaddress              signed 32 bit value (even in 64BIT mode)
 //   scale                        index register is multiplied by the scale,
 //                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
 //   indexregister                one of:
 //                                 NOREG or
 //                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
 //                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
 //                                  R8 R9 R10 R11 R12 R13 R14 R15
 //
 //   N                            specifies an immediate target. The value
 //                                 for this target is encoded into the
 //                                 opcode sequence using the smallest size
 //                                 possible.
 //   [R]                          specifies a memory target at the address in
 //                                 the baseregister
 //   [R+N]                        specifies a memory target at the address at
 //                                 the value in the base register plus the
 //                                 signed displacement
 //   [N]                          specifies a memory target at the
 //                                 absoluteaddress which can't be larger than
 //                                 a signed 32 bit integer. This makes [N] not
 //                                 very useful in 64 bit mode
 //   [O]                          specifies a memory target at an offset in the
 //                                 current compile buffer.
 //   [R+S*R+N]                    specifies a memory target at the address at
 //                                 baseregister + (scale*indexregister) +
 //                                 displacement (displacement is signed)
 //
 //   ->                        sets the direction to forward.
 //                                 This is the default value.
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //   <-                        sets the direction to reverse.
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //
 //   8BIT                         sets the data size of the instruction to
 //                                 1 byte
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //   16BIT                        sets the data size of the instruction to
 //                                 2 bytes
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //   32BIT                        sets the data size of the instruction to
 //                                 4 bytes
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //   64BIT                        sets the data size of the instruction to
 //                                 8 bytes
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //
 //   minimumimmediatesize         minimum size used to encode the immediate
 //                                 value in bytes, can be either 0, 1, 2, or 4
 //   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
 //                                 The displacement will not be encoded using
 //                                 a size less than this.
 //   IMMEDIATE                    specifies an immediate target.
 //                                 The value for this target is encoded into
 //                                 the opcode sequence.
 //   R                            specifies a register target. If this is used
 //                                 and a register A opcode sequence for this
 //                                 instruction is available, the register A
 //                                 opcode sequence is NOT used. opcode+r, or
 //                                 modr/m is used instead.
 //                                 R is optional.
 //                                 If a targetregister is specified without R,
 //                                 and AL, AX, EAX, or RAX is specified,
 //                                 and there is a register A opcode sequence
 //                                 available, then the register A opcode
 //                                 sequence is used.
 //   [MOD]                        specifies a memory target at the address of
 //                                 baseregister plus displacement using modr/m
 //                                 encoding. The encoding may be promoted to
 //                                 sib if modr/m does not support it.
 //   [SIB]                        specifies a memory target at the address at
 //                                 baseregister + (scale*indexregister) +
 //                                 displacement (displacement is signed)
 //                                 sib encoding will be used.
 //
 //   datasizevalue                data size of the instruction in bytes,
 //                                 can be 1, 2, or 4
 //   DATASIZE                     sets the data size of the instruction
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //
 // Data stack out:
 //  none
 //
 // Execute state action:
 //  Pulls two targets from the data stack and compiles the opcode sequence for
 //   an x86 BTC instruction. This opcode copies a bit from one of the targets
 //   to the carry flag. Then the bit in the target that was copied is flipped.
 //  If one of the targets is immediate, then that target indexes which
 //   bit to copy and flip in the other target.
 //  If one of the targets is memory, then that target holds the bit
 //   array and the register holds which bit to copy and flip.
 //  If both targets are registers and forward is specified, which
 //   is the default, then the first target holds the bit array, and
 //   the second target holds which bit to copy and flip.
 //  If both targets are registers and reverse is specified,
 //   then the second target holds the bit array, and
 //   the first target holds which bit to copy and flip.
 //  Two memory targets are not supported.
 //  Two immediate targets are not supported.
 //
 // Compile state action:
 //  Compiles a call to a subroutine that does the execute state action.
 //
 // Examples:
 //  EAX 5 N BTC, // copies bit 5 of EAX to the carry flag
 //               //  then flips bit 5 of EAX
 //  EAX EBX BTC, // EBX indexes the bit to copy to the carry flag and flip
 //               //  in EAX
 //  EAX EBX [R] BTC, // EAX indexes the bit to copy to the carry flag
 //                   //  and flip in the memory pointed to by EBX
 //
 // Failure cases:
 //  I didn't check the failure cases thoroughly, soo...
 //   you may see some strange things if you are not careful.
 //
 // /////////////////////////////////////////////////////////////////////////////////
 
 // /////////////////////////////////////////////////////////////////////////////////
 //
 // dg_forthbtrcomma ( BTR, )
 //
 // C prototype:
 //  void dg_forthbtrcomma (Bufferhandle* pBHarrayhead)
 //
 // Inputs:
 //  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
 //                                 used as the bufferhandle for the array where
 //                                 the other bufferhandles are stored.
 //
 // Stack action shorthand:
 //
 //  ( targetxparameterlist targetyparameterlist -- )
 //
 // Data stack in:
 //
 //  targetxparameterlist
 //  targetyparameterlist
 //
 //
 //  The parameter list for a target can contain these addressing mode specifiers:
 //
 //   datavalue N
 //   targetregister
 //   baseregister [R]
 //   baseregister displacement [R+N]
 //   absoluteaddress [N]
 //   baseregister scale indexregister displacement [R+S*R+N]
 //
 //  In 64 bit addressing mode, the parameter list for a target can also
 //   contain this specifier:
 //
 //   currentcompilebufferoffset [O]
 //
 //  If you want more control over how the instruction is encoded,
 //   you can use these addressing mode specifiers instead:
 //
 //   datavalue datasize IMMEDIATE
 //   targetregister R
 //   baseregister displacement minimumdisplacementsize [MOD]
 //   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
 //
 //  If you want to set the direction for modes that allow it, which are modes
 //   with no immediate target, you can use these:
 //   ->
 //   <-
 //
 //  If you need to set the data size for modes requiring it, which are modes
 //   that do not have at least one register target, you can use these:
 //   8BIT
 //   16BIT
 //   32BIT
 //   64BIT
 //
 //  Alternative way to set the data size:
 //   datasizevalue DATASIZE
 //
 //  Description of target parameters:
 //
 //   immediatevalue               signed 32 bit value of an immediate target
 //   targetregister               in both 32 and 64 bit address mode, one of:
 //                                 AL BL CL DL AH BH CH DH
 //                                 AX BX CX DX BP SI DI SP
 //                                 EAX EBX ECX EDX EBP ESI EDI ESP
 //                                in 64 bit address mode, also one of:
 //                                 SPL BPL SIL DIL
 //                                 RAX RBX RCX RDX RBP RSI RDI RSP
 //                                 R8L R9L R10L R11L R12L R13L R14L R15L
 //                                 R8W R9W R10W R11W R12W R13W R14W R15W
 //                                 R8D R9D R10D R11D R12D R13D R14D R15D
 //                                 R8 R9 R10 R11 R12 R13 R14 R15
 //   baseregister                 one of:
 //                                 NOREG or
 //                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
 //                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
 //                                  R8 R9 R10 R11 R12 R13 R14 R15
 //   displacement                 signed 32 bit value (even in 64BIT mode)
 //   absoluteaddress              signed 32 bit value (even in 64BIT mode)
 //   scale                        index register is multiplied by the scale,
 //                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
 //   indexregister                one of:
 //                                 NOREG or
 //                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
 //                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
 //                                  R8 R9 R10 R11 R12 R13 R14 R15
 //
 //   N                            specifies an immediate target. The value
 //                                 for this target is encoded into the
 //                                 opcode sequence using the smallest size
 //                                 possible.
 //   [R]                          specifies a memory target at the address in
 //                                 the baseregister
 //   [R+N]                        specifies a memory target at the address at
 //                                 the value in the base register plus the
 //                                 signed displacement
 //   [N]                          specifies a memory target at the
 //                                 absoluteaddress which can't be larger than
 //                                 a signed 32 bit integer. This makes [N] not
 //                                 very useful in 64 bit mode
 //   [O]                          specifies a memory target at an offset in the
 //                                 current compile buffer.
 //   [R+S*R+N]                    specifies a memory target at the address at
 //                                 baseregister + (scale*indexregister) +
 //                                 displacement (displacement is signed)
 //
 //   ->                        sets the direction to forward.
 //                                 This is the default value.
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //   <-                        sets the direction to reverse.
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //
 //   8BIT                         sets the data size of the instruction to
 //                                 1 byte
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //   16BIT                        sets the data size of the instruction to
 //                                 2 bytes
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //   32BIT                        sets the data size of the instruction to
 //                                 4 bytes
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //   64BIT                        sets the data size of the instruction to
 //                                 8 bytes
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //
 //   minimumimmediatesize         minimum size used to encode the immediate
 //                                 value in bytes, can be either 0, 1, 2, or 4
 //   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
 //                                 The displacement will not be encoded using
 //                                 a size less than this.
 //   IMMEDIATE                    specifies an immediate target.
 //                                 The value for this target is encoded into
 //                                 the opcode sequence.
 //   R                            specifies a register target. If this is used
 //                                 and a register A opcode sequence for this
 //                                 instruction is available, the register A
 //                                 opcode sequence is NOT used. opcode+r, or
 //                                 modr/m is used instead.
 //                                 R is optional.
 //                                 If a targetregister is specified without R,
 //                                 and AL, AX, EAX, or RAX is specified,
 //                                 and there is a register A opcode sequence
 //                                 available, then the register A opcode
 //                                 sequence is used.
 //   [MOD]                        specifies a memory target at the address of
 //                                 baseregister plus displacement using modr/m
 //                                 encoding. The encoding may be promoted to
 //                                 sib if modr/m does not support it.
 //   [SIB]                        specifies a memory target at the address at
 //                                 baseregister + (scale*indexregister) +
 //                                 displacement (displacement is signed)
 //                                 sib encoding will be used.
 //
 //   datasizevalue                data size of the instruction in bytes,
 //                                 can be 1, 2, 4, or 8
 //   DATASIZE                     sets the data size of the instruction
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //
 // Data stack out:
 //  none
 //
 // Execute state action:
 //  Pulls two targets from the data stack and compiles the opcode sequence for
 //   an x86 BTR instruction. This opcode copies a bit from one of the targets
 //   to the carry flag. Then the bit in the target that was copied is cleared.
 //  If one of the targets is immediate, then that target indexes which
 //   bit to copy and clear in the other target.
 //  If one of the targets is memory, then that target holds the bit
 //   array and the register holds which bit to copy and clear.
 //  If both targets are registers and forward is specified, which
 //   is the default, then the first target holds the bit array, and
 //   the second target holds which bit to copy and clear.
 //  If both targets are registers and reverse is specified,
 //   then the second target holds the bit array, and
 //   the first target holds which bit to copy and clear.
 //  Two memory targets are not supported.
 //  Two immediate targets are not supported.
 //
 // Compile state action:
 //  Compiles a call to a subroutine that does the execute state action.
 //
 // Examples:
 //  EAX 5 N BTR, // copies bit 5 of EAX to the carry flag
 //               //  then clears bit 5 of EAX
 //  EAX EBX BTR, // EBX indexes the bit to copy to the carry flag and clear
 //               //  in EAX
 //  EAX EBX [R] BTR, // EAX indexes the bit to copy to the carry flag
 //                   //  and clear in the memory pointed to by EBX
 //
 // Failure cases:
 //  I didn't check the failure cases thoroughly, soo...
 //   you may see some strange things if you are not careful.
 //
 // /////////////////////////////////////////////////////////////////////////////////
 
 // /////////////////////////////////////////////////////////////////////////////////
 //
 // dg_forthbtscomma ( BTS, )
 //
 // C prototype:
 //  void dg_forthbtscomma (Bufferhandle* pBHarrayhead)
 //
 // Inputs:
 //  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
 //                                 used as the bufferhandle for the array where
 //                                 the other bufferhandles are stored.
 //
 // Stack action shorthand:
 //
 //  ( targetxparameterlist targetyparameterlist -- )
 //
 // Data stack in:
 //
 //  targetxparameterlist
 //  targetyparameterlist
 //
 //
 //  The parameter list for a target can contain these addressing mode specifiers:
 //
 //   datavalue N
 //   targetregister
 //   baseregister [R]
 //   baseregister displacement [R+N]
 //   absoluteaddress [N]
 //   baseregister scale indexregister displacement [R+S*R+N]
 //
 //  In 64 bit addressing mode, the parameter list for a target can also
 //   contain this specifier:
 //
 //   currentcompilebufferoffset [O]
 //
 //  If you want more control over how the instruction is encoded,
 //   you can use these addressing mode specifiers instead:
 //
 //   datavalue datasize IMMEDIATE
 //   targetregister R
 //   baseregister displacement minimumdisplacementsize [MOD]
 //   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
 //
 //  If you want to set the direction for modes that allow it, which are modes
 //   with no immediate target, you can use these:
 //   ->
 //   <-
 //
 //  If you need to set the data size for modes requiring it, which are modes
 //   that do not have at least one register target, you can use these:
 //   16BIT
 //   32BIT
 //   64BIT
 //
 //  Alternative way to set the data size:
 //   datasizevalue DATASIZE
 //
 //  Description of target parameters:
 //
 //   immediatevalue               signed 32 bit value of an immediate target
 //   targetregister               in both 32 and 64 bit address mode, one of:
 //                                 AL BL CL DL AH BH CH DH
 //                                 AX BX CX DX BP SI DI SP
 //                                 EAX EBX ECX EDX EBP ESI EDI ESP
 //                                in 64 bit address mode, also one of:
 //                                 SPL BPL SIL DIL
 //                                 RAX RBX RCX RDX RBP RSI RDI RSP
 //                                 R8L R9L R10L R11L R12L R13L R14L R15L
 //                                 R8W R9W R10W R11W R12W R13W R14W R15W
 //                                 R8D R9D R10D R11D R12D R13D R14D R15D
 //                                 R8 R9 R10 R11 R12 R13 R14 R15
 //   baseregister                 one of:
 //                                 NOREG or
 //                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
 //                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
 //                                  R8 R9 R10 R11 R12 R13 R14 R15
 //   displacement                 signed 32 bit value (even in 64BIT mode)
 //   absoluteaddress              signed 32 bit value (even in 64BIT mode)
 //   scale                        index register is multiplied by the scale,
 //                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
 //   indexregister                one of:
 //                                 NOREG or
 //                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
 //                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
 //                                  R8 R9 R10 R11 R12 R13 R14 R15
 //
 //   N                            specifies an immediate target. The value
 //                                 for this target is encoded into the
 //                                 opcode sequence using the smallest size
 //                                 possible.
 //   [R]                          specifies a memory target at the address in
 //                                 the baseregister
 //   [R+N]                        specifies a memory target at the address at
 //                                 the value in the base register plus the
 //                                 signed displacement
 //   [N]                          specifies a memory target at the
 //                                 absoluteaddress which can't be larger than
 //                                 a signed 32 bit integer. This makes [N] not
 //                                 very useful in 64 bit mode
 //   [O]                          specifies a memory target at an offset in the
 //                                 current compile buffer.
 //   [R+S*R+N]                    specifies a memory target at the address at
 //                                 baseregister + (scale*indexregister) +
 //                                 displacement (displacement is signed)
 //
 //   ->                        sets the direction to forward.
 //                                 This is the default value.
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //   <-                        sets the direction to reverse.
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //
 //   16BIT                        sets the data size of the instruction to
 //                                 2 bytes
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //   32BIT                        sets the data size of the instruction to
 //                                 4 bytes
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //   64BIT                        sets the data size of the instruction to
 //                                 8 bytes
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //
 //   minimumimmediatesize         minimum size used to encode the immediate
 //                                 value in bytes, can be either 0, 1, 2, or 4
 //   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
 //                                 The displacement will not be encoded using
 //                                 a size less than this.
 //   IMMEDIATE                    specifies an immediate target.
 //                                 The value for this target is encoded into
 //                                 the opcode sequence.
 //   R                            specifies a register target. If this is used
 //                                 and a register A opcode sequence for this
 //                                 instruction is available, the register A
 //                                 opcode sequence is NOT used. opcode+r, or
 //                                 modr/m is used instead.
 //                                 R is optional.
 //                                 If a targetregister is specified without R,
 //                                 and AL, AX, EAX, or RAX is specified,
 //                                 and there is a register A opcode sequence
 //                                 available, then the register A opcode
 //                                 sequence is used.
 //   [MOD]                        specifies a memory target at the address of
 //                                 baseregister plus displacement using modr/m
 //                                 encoding. The encoding may be promoted to
 //                                 sib if modr/m does not support it.
 //   [SIB]                        specifies a memory target at the address at
 //                                 baseregister + (scale*indexregister) +
 //                                 displacement (displacement is signed)
 //                                 sib encoding will be used.
 //
 //   datasizevalue                data size of the instruction in bytes,
 //                                 can be 2, 4, or 8
 //   DATASIZE                     sets the data size of the instruction
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //
 // Data stack out:
 //  none
 //
 // Execute state action:
 //  Pulls two targets from the data stack and compiles the opcode sequence for
 //   an x86 BTS instruction. This opcode copies a bit from one of the targets
 //   to the carry flag. Then the bit in the target that was copied is set.
 //  If one of the targets is immediate, then that target indexes which
 //   bit to copy and set in the other target.
 //  If one of the targets is memory, then that target holds the bit
 //   array and the register holds which bit to copy and set.
 //  If both targets are registers and forward is specified, which
 //   is the default, then the first target holds the bit array, and
 //   the second target holds which bit to copy and set.
 //  If both targets are registers and reverse is specified,
 //   then the second target holds the bit array, and
 //   the first target holds which bit to copy and set.
 //  Two memory targets are not supported.
 //  Two immediate targets are not supported.
 //
 // Compile state action:
 //  Compiles a call to a subroutine that does the execute state action.
 //
 // Examples:
 //  EAX 5 N BTS, // copies bit 5 of EAX to the carry flag
 //               //  then sets bit 5 of EAX
 //  EAX EBX BTS, // EBX indexes the bit to copy to the carry flag and set
 //               //  in EAX
 //  EAX EBX [R] BTS, // EAX indexes the bit to copy to the carry flag
 //                   //  and set in the memory pointed to by EBX
 //
 // Failure cases:
 //  I didn't check the failure cases thoroughly, soo...
 //   you may see some strange things if you are not careful.
 //
 // /////////////////////////////////////////////////////////////////////////////////
 
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthbzhicomma ( BZHI, )
//
// C prototype:
//  void dg_forthbzhicomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for targetxparameterlist can
//   contain this addressing mode specifier:
//
//   targetregister
//
//  The parameter list for targetyparameterlist and targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetyparameterlist and
//   targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  If you want to set the data size for memory targets, you can use these:
//   (This is optional for this instruction. The compiler can figure it out from
//   the size of the destination register.)
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of a memory target in bytes,
//                                 can be 4 or 8
//   DATASIZE                     sets the data size of a memory target
//                                 This is pushed after a memory target
//                                 parameters and can not come in the middle
//                                 of a memory target.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 BZHI instruction. This opcode sequence copies the source then clears
//   the bit at the index specified by the value in target x and clears all 
//   higher bits, then puts the result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples
//
//   EDX  RAX [R]  ECX  BZHI,  // [RAX][EDX[7:0]-1:0] -> ECX[EDX[7:0]-1:0]
//                              // 0 -> ECX[31:EDX[7:0]]
//
//   RDX  RAX [R]  RCX  BZHI,  // [RAX][RDX[7:0]-1:0] -> RCX[RDX[7:0]-1:0]
//                             // 0 -> RCX[63:RDX[7:0]]
//
//   RDX  RAX  RCX  BZHI,      // RAX[RDX[7:0]-1:0] -> RCX[RDX[7:0]-1:0]
//                             // 0 -> RCX[63:RDX[7:0]]
//
//   RDX  RCX <-  RAX  BZHI,   // RAX[RDX[7:0]-1:0] -> RCX[RDX[7:0]-1:0]
//                             // 0 -> RCX[63:RDX[7:0]]
//
// Note:
//  Putting reverse after any target makes the second target pushed the
//   destination target, and the third target pushed the source target.
//  Only one target can be a memory target. The destination must be a
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcallcomma ( CALL, )
//
// C prototype:
//  void dg_forthcallcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//
//  The parameter list for this target can contain these addressing mode specifiers:
//
//   pcrelativeoffset EIP+N
//   currentcompilebufferoffset O
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   pcrelativeoffset             32 bit value of a pc relative offset
//   currentcompilebufferoffset   32 bit value of an offset in the current
//                                 compile buffer
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   EIP+N                        specifies a pc relative offset
//   O                            specifies an offset in the current compile buffer
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence
//   for a call. This opcode sequence pushes the address after the opcode
//   sequence to the return stack. The program counter is then loaded with
//   a value based on the type of target:
//   If the target is EIP+N, then the value is added to the address
//    after the opcode sequence and stored in the program counter.
//   If the target is O, then the offset in the current compile buffer
//    is converted to an EIP+N offset. This offset is added to the address
//    after the opcode sequence and stored in the program counter.
//   If the target is register or memory, then the program counter is loaded
//    with the target's value.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  0 RIP+N  CALL,  // pushes addressafterinstruction to return stack
//
//  -5 RIP+N  CALL, // pushes addressafterinstruction to return stack
//                  // addressofthisinstruction -> RIP
//                  // (this gets processor stuck in loop, fills up
//                  //   return stack, and eventually crashes computer)
//
//  RAX  CALL,      // pushes addressafterinstruction to return stack
//                  // RAX -> RIP
//
//  RSI [R]  CALL,  // pushes addressafterinstruction to return stack
//                  // [RSI] -> RIP
//
//  bufferoffset O CALL,  // pushes addressafterinstruction to return stack
//                        // RIP+offsettobufferoffset -> RIP
//
//  bufferoffset [O]  CALL,  // pushes addressafterinstruction to return stack
//                           // [RIP+offsettobufferoffset] -> RIP
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcallbracketssplusn16comma ( CALL[SS]+N16, )
//
// C prototype:
//  void dg_forthcallbracketssplusn16comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for this target y can contain a constant value:
//   farsegmentaddress
//
//  The parameter list for this target x can contain these addressing mode specifiers:
//
//   immediatevalue N
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   farsegmentaddress            16 bit integer value
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The immediate
//                                 size is always the same for this instruction
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence
//   for a 'far' call. This opcode sequence pushes the return information to
//   the return stack and loads the program counter and code segment registers
//   based on the type of target and operating mode of the cpu:
//    If the processor is in real address mode or virtual 8086 mode the program
//     counter is loaded with address from target y, and the code segment register 
//     is loaded with the value from target x
//    If the processor is in protected mode it's kind of complicated. Please
//     refer to the Intel docs for details, but in short target x points to a
//     data structure which contains the information needed to find the true
//     destination. I think the data structure might be stored in the cpu's 
//     memory but I'm not sure. I think target y is an offset that is added
//     to the base address calculated from the data structure but I'm not sure.
//    If the processor is in 64 bit mode it's similar to the protected mode
//     operation except, if the data structure is a call gate then target y's
//     value is not used otherwise target y is used as an offset from a base
//     address which comes from something called a far pointer... which I guess
//     comes from the data structure.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Note:
//  Target y is just a number, you don't need anything like N after it. Since
//   it is always a number I decided to make it so you didn't have to type in 
//   the N. Also target y in this instruction is 16 bits.
//  Target x can not be immediate in 64 bit address mode.
//  This instruction was not tested.
// 
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// /////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcallbracketssplusn32comma ( CALL[SS]+N32, )
//
// C prototype:
//  void dg_forthcallbracketssplusn32comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for this target y can contain a constant value:
//   farsegmentaddress
//
//  The parameter list for this target x can contain these addressing mode specifiers:
//
//   immediatevalue N
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   farsegmentaddress            32 bit integer value
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The immediate
//                                 size is always the same for this instruction
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence
//   for a 'far' call. This opcode sequence pushes the return information to
//   the return stack and loads the program counter and code segment registers
//   based on the type of target and operating mode of the cpu:
//    If the processor is in real address mode or virtual 8086 mode the program
//     counter is loaded with address from target y, and the code segment register 
//     is loaded with the value from target x
//    If the processor is in protected mode it's kind of complicated. Please
//     refer to the Intel docs for details, but in short target x points to a
//     data structure which contains the information needed to find the true
//     destination. I think the data structure might be stored in the cpu's 
//     memory but I'm not sure. I think target y is an offset that is added
//     to the base address calculated from the data structure but I'm not sure.
//    If the processor is in 64 bit mode it's similar to the protected mode
//     operation except, if the data structure is a call gate then target y's
//     value is not used otherwise target y is used as an offset from a base
//     address which comes from something called a far pointer... which I guess
//     comes from the data structure.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Note:
//  Target y is just a number, you don't need anything like N after it. Since
//   it is always a number I decided to make it so you didn't have to type in 
//   the N. Also target y in this instruction is 32 bits.
//  Target x can not be immediate in 64 bit address mode.
//  This instruction was not tested.
// 
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcallbracketssplusn64comma ( CALL[SS]+N64, )
//
// C prototype:
//  void dg_forthcallbracketssplusn64comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for this target y can contain a constant value:
//   farsegmentaddress
//
//  The parameter list for this target x can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   farsegmentaddress            64 bit integer value
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence
//   for a 'far' call. This opcode sequence pushes the return information to
//   the return stack and loads the program counter and code segment registers
//   based on the type of target and operating mode of the cpu:
//    If the processor is in real address mode or virtual 8086 mode the program
//     counter is loaded with address from target y, and the code segment register 
//     is loaded with the value from target x
//    If the processor is in protected mode it's kind of complicated. Please
//     refer to the Intel docs for details, but in short target x points to a
//     data structure which contains the information needed to find the true
//     destination. I think the data structure might be stored in the cpu's 
//     memory but I'm not sure. I think target y is an offset that is added
//     to the base address calculated from the data structure but I'm not sure.
//    If the processor is in 64 bit mode it's similar to the protected mode
//     operation except, if the data structure is a call gate then target y's
//     value is not used otherwise target y is used as an offset from a base
//     address which comes from something called a far pointer... which I guess
//     comes from the data structure.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Note:
//  Target y is just a number, you don't need anything like N after it. Since
//   it is always a number I decided to make it so you didn't have to type in 
//   the N. Also target y in this instruction is 64 bits.
//  Target x can not be immediate in 64 bit address mode.
//  This instruction is only supported in 64 bit address mode.
//  This instruction was not tested.
// 
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcbwcomma ( CBW, AL->AX, )
//
// C prototype:
//  void dg_forthcbwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode string for the x86 CBW instruction.
//  This instruction sign extends AL into AX
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  AL->AX,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcdqcomma ( CDQ, EAX->EDX:EAX, )
//
// C prototype:
//  void dg_forthcdqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode string for the x86 CDQ instruction.
//  This instruction sign extends EAX into EDX:EAX
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  EAX->EDX:EAX,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcdqecomma ( CDQE, EAX->RAX, )
//
// C prototype:
//  void dg_forthcdqecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode string for the x86 CDQE instruction.
//  This instruction sign extends EAX into RAX
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  EAX->RAX,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthclaccomma ( CLAC, )
//
// C prototype:
//  void dg_forthclaccomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode sequence for an x86 CLAC instruction. This instruction
//   clears the alignment check flag bit.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  CLAC,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthclccomma ( CLC, )
//
// C prototype:
//  void dg_forthclccomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode sequence for an x86 CLC instruction. This instruction
//   clears the carry flag bit.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  CLC,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcldcomma ( CLD, )
//
// C prototype:
//  void dg_forthcldcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode sequence for an x86 CLD instruction. This instruction
//   clears the direction flag bit. When the direction bit is clear, ESI and EDI
//   increment during string operations.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Note:
//   Some operating systems require the direction bit to
//   be left to forward (clear) when passing control to operating system
//   routines.
//   I usually push the flag register when entering and pop it when exiting
//   subroutines to cover this requirement.
//
// Example:
//  CLD,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthclicomma ( CLI, )
//
// C prototype:
//  void dg_forthclicomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode sequence for an x86 CLI instruction. This instruction
//   clears the interrupt flag bit. When the interrupt bit is clear, interrupts
//   are disabled.
//   This takes effect when the CLI instruction is executed.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  CLI,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcltscomma ( CLTS, )
//
// C prototype:
//  void dg_forthcltscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that clears the cr0 register task switch flag.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  CLTS,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcmccomma ( CMC, )
//
// C prototype:
//  void dg_forthcmccomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode sequence for an x86 CMC instruction.
//  This instruction complements the carry flag bit.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  CMC,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcmovcomma ( CMOV, )
//
// C prototype:
//  void dg_forthcmovcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( targetxparameterlist targetyparameterlist conditioncode -- )
//
// Data stack in:
//  targetxparameterlist          usually the source parameter list
//  targetyparameterlist          usually the destination parameter list
//  conditioncode                 0-15, x86 code for conditional instructions
//                                 is one of:
//                                  VS   for overflow set
//                                  NV   for overflow clear or no overflow
//                                  CS   for carry set
//                                  NC   for carry clear or no carry
//                                  ULT  for unsigned less than
//                                  ULE  for unsigned less than or equal
//                                  UGT  for unsigned greater than
//                                  UGE  for unsigned greater than or equal
//                                  ZS   for zero set
//                                  NZ   for zero clear or no zero
//                                  EQ   for equal or zero set
//                                  NE   for not equal or zero clear
//                                  SS   for sign set
//                                  NS   for no sign or sign clear
//                                  MI   for minus or sign set
//                                  PL   for plus or sign clear
//                                  PS   for parity set
//                                  NP   for no parity or parity clear
//                                  LT   for signed less than
//                                  GE   for signed greater than or equal
//                                  LE   for signed less than or equal
//                                  GT   for signed greater than
//                                  ALWAYS
//                                  NEVER
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   R                            specifies a register target.
//                                 will not affect encoding since
//                                 opcode plus register and register a
//                                 encoding are not available for this
//                                 instruction
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence
//   for an x86 CMOV instruction. This opcode sequence checks the flag register
//   for the condition, and if it is true the memory target is copied to the
//   register target.
//
// Note:
//  If you specify a register and a memory target, it doesn't matter which one
//   is target x or target y or what direction is used. The memory target is
//   the source and the register target is the destination.
//  If you specify two register targets, target x is the source and target y
//   is the destination.
//  If you specify two register targets along with <- then target y is
//   source and target x is the destination.
//  Datasize commands are not needed because there is always at least one
//   register target
//  8 bit data size is not supported.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  EBX [R]  AX  NZ CMOV,       // if the zero flag is clear, copies the 16 bit
//                              //  value in memory at the address in EBX to AX
//  ECX  EAX  CS CMOV,          // if the carry flag is set, copies ECX to EAX
//  ECX <- EAX  NC CMOV,     // if the carry flag is clear, copies EAX to ECX
//  RCX  RAX  MI CMOV,          // if the sign flag is set, copies RCX to RAX
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some strange
//   things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcmpcomma ( CMP, )
//
// C prototype:
//  void dg_forthcmpcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist          
//  targetyparameterlist          
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   immediatevalue minimumimmediatesize IMMEDIATE
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   8BIT
//   16BIT
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               integer constant that gets sign extended to
//                                 the DATASIZE of the instruction
//                                 64BIT encoding uses multiple instructions
//                                 smaller encodings get sign extended if needed
//   minimumimmediatesize         minimum encoding size in bytes for 
//                                 immediatevalue.
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. If N is larger than a signed
//                                 32 bit integer, multiple instructions 
//                                 are compiled and if the destination is memory
//                                 RAX is used.
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate 
//                                 value in bytes, can be either 0, 1, 2, or 4
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target. 
//                                 R is optional.
//                                 Using R also forces some instructions to be 
//                                 encoded using the MODR encoding.
//                                 For INC, and DEC, in 32 bit mode; and also
//                                  PUSH, and POP, using R forces the use
//                                  of MODR encoding instead of opcode+r.
//                                 For ADD, ADC, AND, OR, XOR, SUB, SBB, and
//                                  CMP, add immediate to register a
//                                  instructions, using R forces the use of
//                                  MODR encoding instead of the reg a opcodes.
//                                 The MODR encoding for add 8 bit immediate 
//                                  sign extended to 32 or 64 bits is shorter
//                                  than the reg a opcode encoding. If you
//                                  want this shorter encoding, you have to
//                                  use R.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m is not supported.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1, 2, 4, or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//                                                              
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 CMP instruction. This sequence evaluates the destination target
//   minus the source target and changes the condition code flags accordingly.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX 12348000 N  EAX  CMP,  // subtracts 12348000 from EAX, result is
//                             //  not stored, only flags are set
//  27 N  CL  CMP,             // subtracts 27 from CL, result is not stored
//                             //  only flags are set
//  AX  EBX [R]  CMP,          // subtracts AX from the 16 bit memory at
//                             //  the address in EBX, result is not stored, 
//                             //  only flags are set
//  38 N  EDX [R]  32BIT CMP,  // size required, subtracts 38 from the 32 bit
//                             //  memory at the address in EDX, result is not
//                             //  stored, only flags are set
//  ECX  EAX  CMP,             // subtracts ECX from EAX, result is not stored,
//                             //  only flags are set
//  ECX <- EAX  CMP,        // subtracts EAX from ECX, result is not stored,
//                             //  only flags are set
//  38 N  EAX R  CMP,          // subtracts 38 from EAX using the n8 to n32
//                             //  sign extended modr/m encoding, result is not
//                             //  stored, only flags are set
//
// Note:
//  Only 1 target can be a memory target.
//  If the address mode is 64 bits, the data size is 64 bits, and N is larger 
//   than what will fit in a signed 32 bit integer, and the destination is memory
//   then  value N  RAX MOV,  RAX mem CMP,  is compiled.
//  If the address mode is 64 bits, the data size is 64 bits, and N is larger 
//   than what will fit in a signed 32 bit integer, and the destination is a
//   register then  register PUSH,  N register MOV,  register RSP [R] CMP,
//   register POP,  is compiled.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcmppdcomma ( CMPPD, )
//
// C prototype:
//  void dg_forthcmppdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 CMPPD instruction. This opcode sequence compares each double
//   precision floating point value in the destination with the corresponding
//   double precision floating point values in the source based on the
//   comparison mode in the immediate value and replaces the destination
//   values with a true flag, which is 64 1s, or a false flag, which is 64 0s,
//   based on the result of the comparisons. For these comparisons, +0 = -0.
//
//   Immediate value, comparison mode:
//    0                true if destination = source
//    1                true if destination < source
//    2                true if destination <= source
//    3                true if destination or source is NaN
//    4                true if destination does not = source
//    5                false if destination < source
//                      true if either destination or source is NaN
//    6                false if destination <= source
//                      true if either destination or source is NaN
//    7                true if neither destination nor source is NaN
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM0  CMPPD,   // if [RBX][63:0] = XMM0[63:0] then
//                                    1 -> XMM0[63:0]
//                                // else 0 -> XMM0[63:0]
//                                // if [RBX][127:64] = XMM0[127:64] then
//                                    1 -> XMM0[127:64]
//                                // else 0 -> XMM0[127:64]
//
//  01 N  RBX [R]  XMM0  CMPPD,   // if XMM0[63:0] < [RBX][63:0] then
//                                    1 -> XMM0[63:0]
//                                // else 0 -> XMM0[63:0]
//                                // if XMM0[127:64] < [RBX][127:64] then
//                                    1 -> XMM0[127:64]
//                                // else 0 -> XMM0[127:64]
//
//  02 N  RBX [R]  XMM0  CMPPD,   // if [RBX][63:0] <= XMM0[63:0] then
//                                    1 -> XMM0[63:0]
//                                // else 0 -> XMM0[63:0]
//                                // if [RBX][127:64] <= XMM0[127:64] then
//                                    1 -> XMM0[127:64]
//                                // else 0 -> XMM0[127:64]
//
//  04 N  RBX [R]  XMM0  CMPPD,   // if [RBX][63:0] != XMM0[63:0] then
//                                    1 -> XMM0[63:0]
//                                // else 0 -> XMM0[63:0]
//                                // if [RBX][127:64] != XMM0[127:64] then
//                                    1 -> XMM0[127:64]
//                                // else 0 -> XMM0[127:64]
//
//  01 N  XMM2  XMM0  CMPPD,      // if XMM2[63:0] < XMM0[63:0] then
//                                    1 -> XMM0[63:0]
//                                // else 0 -> XMM0[63:0]
//                                // if XMM2[127:64] < XMM0[127:64] then
//                                    1 -> XMM0[127:64]
//                                // else 0 -> XMM0[127:64]
//
//  01 N  XMM2 <- XMM0  CMPPD, // if XMM0[63:0] < XMM2[63:0] then
//                                    1 -> XMM2[63:0]
//                                // else 0 -> XMM2[63:0]
//                                // if XMM0[127:64] < XMM2[127:64] then
//                                    1 -> XMM2[127:64]
//                                // else 0 -> XMM2[127:64]
//
//  01 N  XMM0  XMM8 CMPPD,       // if XMM0[63:0] < XMM8[63:0] then
//                                    1 -> XMM8[63:0]
//                                // else 0 -> XMM8[63:0]
//                                // if XMM0[127:64] < XMM8[127:64] then
//                                    1 -> XMM8[127:64]
//                                // else 0 -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvcmppdcomma ( VCMPPD, )
//
// C prototype:
//  void dg_forthvcmppdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist 
//     targetzparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for target y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter lists for targets x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for target x and target z can 
//   also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls four targets from the data stack and compiles the opcode sequence for
//   an x86 VCMPPD instruction. This opcode sequence compares each double
//   precision floating point value in target y with the corresponding
//   double precision floating point values in the source based on the
//   comparison mode in the target w value and replaces the destination values
//   with a true flag, which is 64 1s, or a false flag, which is 64 0s,
//   based on the result of the comparisons. For these comparisons, +0 = -0.
//
//   Target w value, comparison mode:
//    0                true if destination = source
//    1                true if destination < source
//    2                true if destination <= source
//    3                true if destination or source is NaN
//    4                true if destination does not = source
//    5                false if destination < source
//                      true if either destination or source is NaN
//    6                false if destination <= source
//                      true if either destination or source is NaN
//    7                true if neither destination nor source is NaN
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM1  XMM0  VCMPPD, // if [RBX][63:0] = XMM1[63:0] then
//                                     //  1 -> XMM0[63:0]
//                                     // else 0 -> XMM0[63:0]
//                                     // if [RBX][127:64] = XMM1[127:64] then
//                                     //  1 -> XMM0[127:64]
//                                     // else 0 -> XMM0[127:64]
//
//  01 N  RBX [R]  XMM1  XMM0  VCMPPD, // if [RBX][63:0] < XMM1[63:0] then
//                                     //  1 -> XMM0[63:0]
//                                     // else 0 -> XMM0[63:0]
//                                     // if [RBX][127:64] < XMM1[127:64] then
//                                     //  1 -> XMM0[127:64]
//                                     // else 0 -> XMM0[127:64]
//
//  02 N  RBX [R]  XMM1  XMM0  VCMPPD, // if [RBX][63:0] <= XMM1[63:0] then
//                                     //  1 -> XMM0[63:0]
//                                     // else 0 -> XMM0[63:0]
//                                     // if [RBX][127:64] <= XMM1[127:64] then
//                                     //  1 -> XMM0[127:64]
//                                     // else 0 -> XMM0[127:64]
//
//  04 N  RBX [R]  XMM1  XMM0  VCMPPD, // if [RBX][63:0] != XMM1[63:0] then
//                                     //  1 -> XMM0[63:0]
//                                     // else 0 -> XMM0[63:0]
//                                     // if [RBX][127:64] != XMM1[127:64] then
//                                     //  1 -> XMM0[127:64]
//                                     // else 0 -> XMM0[127:64]
//
//  01 N  XMM2  XMM1  XMM0  VCMPPD,    // if XMM2[63:0] < XMM1[63:0] then
//                                     //  1 -> XMM0[63:0]
//                                     // else 0 -> XMM0[63:0]
//                                     // if XMM2[127:64] < XMM1[127:64] then
//                                     //  1 -> XMM0[127:64]
//                                     // else 0 -> XMM0[127:64]
//
//  01 N  XMM2 <- XMM1  XMM0  VCMPPD,
//                                // if XMM0[63:0] < XMM1[63:0] then
//                                //  1 -> XMM2[63:0]
//                                // else 0 -> XMM2[63:0]
//                                // if XMM0[127:64] < XMM1[127:64] then
//                                //  1 -> XMM2[127:64]
//                                // else 0 -> XMM2[127:64]
//
//  01 N  XMM0  XMM1  XMM8 VCMPPD, // if XMM0[63:0] < XMM1[63:0] then
//                                 //  1 -> XMM8[63:0]
//                                 // else 0 -> XMM8[63:0]
//                                 // if XMM0[127:64] < XMM1[127:64] then
//                                 //  1 -> XMM8[127:64]
//                                 // else 0 -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register. The first target must be the immediate target. If you use
//   -> it must come after a memory, xmm or ymm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcmppscomma ( CMPPS, )
//
// C prototype:
//  void dg_forthcmppscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetw can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter lists for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 CMPPS instruction. This opcode sequence compares each single
//   precision floating point value in the destination with the corresponding
//   single precision floating point values in the source based on the
//   comparison mode in the immediate value and replaces the destination
//   values with a true flag, which is 32 1s, or a false flag, which is 32 0s,
//   based on the result of the comparisons. For these comparisons, +0 = -0.
//
//   Immediate value, comparison mode:
//    0                true if destination = source
//    1                true if destination < source
//    2                true if destination <= source
//    3                true if destination or source is NaN
//    4                true if destination does not = source
//    5                false if destination < source
//                      true if either destination or source is NaN
//    6                false if destination <= source
//                      true if either destination or source is NaN
//    7                true if neither destination nor source is NaN
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM0  CMPPS,   // if [RBX][31:0] = XMM0[31:0] then
//                                    1 -> XMM0[31:0]
//                                // else 0 -> XMM0[31:0]
//                                // if [RBX][63:32] = XMM0[63:32] then
//                                    1 -> XMM0[63:32]
//                                // else 0 -> XMM0[63:32]
//                                // if [RBX][95:64] = XMM0[95:64] then
//                                    1 -> XMM0[95:64]
//                                // else 0 -> XMM0[95:64]
//                                // if [RBX][127:96] = XMM0[127:96] then
//                                    1 -> XMM0[127:96]
//                                // else 0 -> XMM0[127:96]
//
//  01 N  RBX [R]  XMM0  CMPPS,   // if [RBX][31:0] < XMM0[31:0] then
//                                    1 -> XMM0[31:0]
//                                // else 0 -> XMM0[31:0]
//                                // if [RBX][63:32] < XMM0[63:32] then
//                                    1 -> XMM0[63:32]
//                                // else 0 -> XMM0[63:32]
//                                // if [RBX][95:64] < XMM0[95:64] then
//                                    1 -> XMM0[95:64]
//                                // else 0 -> XMM0[95:64]
//                                // if [RBX][127:96] < XMM0[127:96] then
//                                    1 -> XMM0[127:96]
//                                // else 0 -> XMM0[127:96]
//
//  02 N  RBX [R]  XMM0  CMPPS,   // if [RBX][31:0] <= XMM0[31:0] then
//                                    1 -> XMM0[31:0]
//                                // else 0 -> XMM0[31:0]
//                                // if [RBX][63:32] <= XMM0[63:32] then
//                                    1 -> XMM0[63:32]
//                                // else 0 -> XMM0[63:32]
//                                // if [RBX][95:64] <= XMM0[95:64] then
//                                    1 -> XMM0[95:64]
//                                // else 0 -> XMM0[95:64]
//                                // if [RBX][127:96] <= XMM0[127:96] then
//                                    1 -> XMM0[127:96]
//                                // else 0 -> XMM0[127:96]
//
//  04 N  RBX [R]  XMM0  CMPPS,   // if [RBX][31:0] != XMM0[31:0] then
//                                    1 -> XMM0[31:0]
//                                // else 0 -> XMM0[31:0]
//                                // if [RBX][63:32] != XMM0[63:32] then
//                                    1 -> XMM0[63:32]
//                                // else 0 -> XMM0[63:32]
//                                // if [RBX][95:64] != XMM0[95:64] then
//                                    1 -> XMM0[95:64]
//                                // else 0 -> XMM0[95:64]
//                                // if [RBX][127:96] != XMM0[127:96] then
//                                    1 -> XMM0[127:96]
//                                // else 0 -> XMM0[127:96]
//
//  01 N  XMM2  XMM0  CMPPS,      // if XMM2[31:0] < XMM0[31:0] then
//                                    1 -> XMM0[31:0]
//                                // else 0 -> XMM0[31:0]
//                                // if XMM2[63:32] < XMM0[63:32] then
//                                    1 -> XMM0[63:32]
//                                // else 0 -> XMM0[63:32]
//                                // if XMM2[95:64] < XMM0[95:64] then
//                                    1 -> XMM0[95:64]
//                                // else 0 -> XMM0[95:64]
//                                // if XMM2[127:96] < XMM0[127:96] then
//                                    1 -> XMM0[127:96]
//                                // else 0 -> XMM0[127:96]
//
//  01 N  XMM0 <- XMM2  CMPPS, // if XMM2[31:0] < XMM0[31:0] then
//                                    1 -> XMM0[31:0]
//                                // else 0 -> XMM0[31:0]
//                                // if XMM2[63:32] < XMM0[63:32] then
//                                    1 -> XMM0[63:32]
//                                // else 0 -> XMM0[63:32]
//                                // if XMM2[95:64] < XMM0[95:64] then
//                                    1 -> XMM0[95:64]
//                                // else 0 -> XMM0[95:64]
//                                // if XMM2[127:96] < XMM0[127:96] then
//                                    1 -> XMM0[127:96]
//                                // else 0 -> XMM0[127:96]
//
//  01 N  XMM8  XMM0 CMPPS,       // if XMM8[31:0] < XMM0[31:0] then
//                                    1 -> XMM0[31:0]
//                                // else 0 -> XMM0[31:0]
//                                // if XMM8[63:32] < XMM0[63:32] then
//                                    1 -> XMM0[63:32]
//                                // else 0 -> XMM0[63:32]
//                                // if XMM8[95:64] < XMM0[95:64] then
//                                    1 -> XMM0[95:64]
//                                // else 0 -> XMM0[95:64]
//                                // if XMM8[127:96] < XMM0[127:96] then
//                                    1 -> XMM0[127:96]
//                                // else 0 -> XMM0[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvcmppscomma ( VCMPPS, )
//
// C prototype:
//  void dg_forthvcmppscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist 
//     targetzparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for target y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter lists for targets x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for target x and target z can 
//   also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls four targets from the data stack and compiles the opcode sequence for
//   an x86 VCMPPS instruction. This opcode sequence compares each single
//   precision floating point value in target y with the corresponding
//   single precision floating point values in the source based on the
//   comparison mode in the target w value and replaces the destination
//   values with a true flag, which is 32 1s, or a false flag, which is 32 0s,
//   based on the result of the comparisons. For these comparisons, +0 = -0.
//
//   Immediate value, comparison mode:
//    0                true if destination = source
//    1                true if destination < source
//    2                true if destination <= source
//    3                true if destination or source is NaN
//    4                true if destination does not = source
//    5                false if destination < source
//                      true if either destination or source is NaN
//    6                false if destination <= source
//                      true if either destination or source is NaN
//    7                true if neither destination nor source is NaN
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM1  XMM0  VCMPPS, // if [RBX][31:0] = XMM1[31:0] then
//                                     //  1 -> XMM0[31:0]
//                                     // else 0 -> XMM0[31:0]
//                                     // if [RBX][63:32] = XMM1[63:32] then
//                                     //  1 -> XMM0[63:32]
//                                     // else 0 -> XMM0[63:32]
//                                     // if [RBX][95:64] = XMM1[95:64] then
//                                     //  1 -> XMM0[95:64]
//                                     // else 0 -> XMM0[95:64]
//                                     // if [RBX][127:96] = XMM1[127:96] then
//                                     //  1 -> XMM0[127:96]
//                                     // else 0 -> XMM0[127:96]
//
//  01 N  RBX [R]  XMM1  XMM0  VCMPPS, // if [RBX][31:0] < XMM1[31:0] then
//                                     //  1 -> XMM0[31:0]
//                                     // else 0 -> XMM0[31:0]
//                                     // if [RBX][63:32] < XMM1[63:32] then
//                                     //  1 -> XMM0[63:32]
//                                     // else 0 -> XMM0[63:32]
//                                     // if [RBX][95:64] < XMM1[95:64] then
//                                     //  1 -> XMM0[95:64]
//                                     // else 0 -> XMM0[95:64]
//                                     // if [RBX][127:96] < XMM1[127:96] then
//                                     //  1 -> XMM0[127:96]
//                                     // else 0 -> XMM0[127:96]
//
//  02 N  RBX [R]  XMM1  XMM0  VCMPPS, // if [RBX][31:0] <= XMM1[31:0] then
//                                     //  1 -> XMM0[31:0]
//                                     // else 0 -> XMM0[31:0]
//                                     // if [RBX][63:32] <= XMM1[63:32] then
//                                     //  1 -> XMM0[63:32]
//                                     // else 0 -> XMM0[63:32]
//                                     // if [RBX][95:64] <= XMM1[95:64] then
//                                     //  1 -> XMM0[95:64]
//                                     // else 0 -> XMM0[95:64]
//                                     // if [RBX][127:96] <= XMM1[127:96] then
//                                     //  1 -> XMM0[127:96]
//                                     // else 0 -> XMM0[127:96]
//
//  04 N  RBX [R]  XMM1  XMM0  VCMPPS, // if [RBX][31:0] != XMM1[31:0] then
//                                     //  1 -> XMM0[31:0]
//                                     // else 0 -> XMM0[31:0]
//                                     // if [RBX][63:32] != XMM1[63:32] then
//                                     //  1 -> XMM0[63:32]
//                                     // else 0 -> XMM0[63:32]
//                                     // if [RBX][95:64] != XMM1[95:64] then
//                                     //  1 -> XMM0[95:64]
//                                     // else 0 -> XMM0[95:64]
//                                     // if [RBX][127:96] != XMM1[127:96] then
//                                     //  1 -> XMM0[127:96]
//                                     // else 0 -> XMM0[127:96]
//
//  01 N  XMM2  XMM1  XMM0  VCMPPS,    // if XMM2[31:0] < XMM1[31:0] then
//                                     //  1 -> XMM0[31:0]
//                                     // else 0 -> XMM0[31:0]
//                                     // if XMM2[63:32] < XMM1[63:32] then
//                                     //  1 -> XMM0[63:32]
//                                     // else 0 -> XMM0[63:32]
//                                     // if XMM2[95:64] < XMM1[95:64] then
//                                     //  1 -> XMM0[95:64]
//                                     // else 0 -> XMM0[95:64]
//                                     // if XMM2[127:96] < XMM1[127:96] then
//                                     //  1 -> XMM0[127:96]
//                                     // else 0 -> XMM0[127:96]
//
//  01 N  XMM0 <- XMM1  XMM2  VCMPPS,
//                                // if XMM2[31:0] < XMM1[31:0] then
//                                //  1 -> XMM0[31:0]
//                                // else 0 -> XMM0[31:0]
//                                // if XMM2[63:32] < XMM1[63:32] then
//                                //  1 -> XMM0[63:32]
//                                // else 0 -> XMM0[63:32]
//                                // if XMM2[95:64] < XMM1[95:64] then
//                                //  1 -> XMM0[95:64]
//                                // else 0 -> XMM0[95:64]
//                                // if XMM2[127:96] < XMM1[127:96] then
//                                //  1 -> XMM0[127:96]
//                                // else 0 -> XMM0[127:96]
//
//  01 N  XMM8  XMM1  XMM0  VCMPPS, // if XMM8[31:0] < XMM1[31:0] then
//                                  //  1 -> XMM0[31:0]
//                                  // else 0 -> XMM0[31:0]
//                                  // if XMM8[63:32] < XMM1[63:32] then
//                                  //  1 -> XMM0[63:32]
//                                  // else 0 -> XMM0[63:32]
//                                  // if XMM8[95:64] < XMM1[95:64] then
//                                  //  1 -> XMM0[95:64]
//                                  // else 0 -> XMM0[95:64]
//                                  // if XMM8[127:96] < XMM1[127:96] then
//                                  //  1 -> XMM0[127:96]
//                                  // else 0 -> XMM0[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register. The first target must be the immediate target. If you use
//   -> it must come after a memory, xmm or ymm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcmpsbcomma ( CMPSB, )
//
// C prototype:
//  void dg_forthcmpsbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array
//                                 where the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode byte for the x86 CMPSB instruction.
//  In 32 bit addressing mode:
//   Compares an 8 bit value by doing [ESI] - [EDI] and sets the flags.
//   EDI and ESI are adjusted according to the direction flag,
//    clear increments and set decrements.
//   ECX is only decremented when a repeat prefix such as NZUNTILREP, is used.
//  In 64 bit addressing mode:
//   Compares an 8 bit value by doing [RSI] - [RDI] and sets the flags.
//   RDI and RSI are adjusted according to the direction flag,
//    clear increments and set decrements.
//   RCX is only decremented when a repeat prefix such as NZUNTILREP, is used.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 32 bit addressing mode example:
//  src  ESI  MOV,
//  dest  EDI  MOV,
//  count  ECX  MOV,
//  NZUNTILREP,  CMPSB,  compares count bytes at src with dest until
//                        a non match is found
//
// 64 bit addressing mode example:
//  src RSI  MOV,
//  dest  RDI  MOV,
//  count  RCX  MOV,
//  NZUNTILREP,  CMPSB,  compares count bytes at src with dest until
//                        a non match is found
//
// Note:
//   Some operating systems require the direction bit to
//   be left to forward (clear) when passing control to operating system
//   routines.
//   I usually push the flag register when entering and pop it when exiting
//   subroutines to cover this requirement.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcmpsdcomma ( CMPSD, )
//
// C prototype:
//  void dg_forthcmpsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array
//                                 where the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode byte for the x86 CMPSD instruction.
//  In 32 bit addressing mode:
//   Compares a 32 bit value by doing [ESI] - [EDI] and sets the flags.
//   EDI and ESI are adjusted according to the direction flag,
//    clear adds 4 and set subtracts 4.
//   ECX is only decremented when a repeat prefix such as NZUNTILREP, is used.
//  In 64 bit addressing mode:
//   Compares a 32 bit value by doing [RSI] - [RDI] and sets the flags.
//   RDI and RSI are adjusted according to the direction flag,
//    clear adds 4 and set subtracts 4.
//   RCX is only decremented when a repeat prefix such as NZUNTILREP, is used.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Note:
//   Some operating systems require the direction bit to
//   be left to forward (clear) when passing control to operating system
//   routines.
//   I usually push the flag register when entering and pop it when exiting
//   subroutines to cover this requirement.
//
// 32 bit addressing mode example:
//  src  ESI  MOV,
//  dest  EDI  MOV,
//  count  ECX  MOV,
//  NZUNTILREP,  CMPSD,  compares count 32 bit units at src with dest until
//                        a non match is found
//
// 64 bit addressing mode example:
//  src  RSI  MOV,
//  dest  RDI  MOV,
//  count  RCX  MOV,
//  NZUNTILREP,  CMPSD,  compares count 32 bit units at src with dest until
//                        a non match is found
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcmpsd2comma ( CMPSD2, )
//
// C prototype:
//  void dg_forthcmpsd2comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for the immediate target can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 CMPSD2 instruction. This opcode sequence compares the double
//   precision floating point value in the lower 64 bits of the destination with
//   the corresponding double precision floating point values in the lower 64
//   bits of the source based on the comparison mode in the immediate value and
//   replaces the destination value with a true flag, which is 64 1s, or a false
//   flag, which is 64 0s, based on the result of the comparison.
//   For these comparisons, +0 = -0.
//   In the Intel manual, this instruction is called CMPSD. I changed the name
//    because it conflicts with the compare string function called CMPSD.
//
//   Immediate value, comparison mode:
//    0                true if destination = source
//    1                true if destination < source
//    2                true if destination <= source
//    3                true if destination or source is NaN
//    4                true if destination does not = source
//    5                false if destination < source
//                      true if either destination or source is NaN
//    6                false if destination <= source
//                      true if either destination or source is NaN
//    7                true if neither destination nor source is NaN
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM0  CMPSD2,   // if [RBX][63:0] = XMM0[63:0] then
//                                 //  1 -> XMM0[63:0]
//                                 // else 0 -> XMM0[63:0]
//
//  01 N  RBX [R]  XMM0  CMPSD2,   // if [RBX][63:0] < XMM0[63:0] then
//                                 //  1 -> XMM0[63:0]
//                                 // else 0 -> XMM0[63:0]
//
//  02 N  RBX [R]  XMM0  CMPSD2,   // if [RBX][63:0] <= XMM0[63:0] then
//                                 //  1 -> XMM0[63:0]
//                                 // else 0 -> XMM0[63:0]
//
//  04 N  RBX [R]  XMM0  CMPSD2,   // if [RBX][63:0] != XMM0[63:0] then
//                                 //  1 -> XMM0[63:0]
//                                 // else 0 -> XMM0[63:0]
//
//  01 N  XMM2  XMM0  CMPSD2,      // if XMM2[63:0] < XMM0[63:0] then
//                                 //  1 -> XMM0[63:0]
//                                 // else 0 -> XMM0[63:0]
//
//  01 N  XMM2 <- XMM0  CMPSD2,    // if XMM0[63:0] < XMM2[63:0] then
//                                 //  1 -> XMM2[63:0]
//                                 // else 0 -> XMM2[63:0]
//
//  01 N  XMM0  XMM8 CMPSD2,       // if XMM0[63:0] < XMM8[63:0] then
//                                 //  1 -> XMM8[63:0]
//                                 // else 0 -> XMM8[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvcmpsdcomma ( VCMPSD, )
//
// C prototype:
//  void dg_forthvcmpsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist 
//     targetzparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for target y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targets x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for target x and target z can 
//   also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls four targets from the data stack and compiles the opcode sequence for
//   an x86 VCMPSD instruction. This opcode sequence compares the double
//   precision floating point value in the lower 64 bits of the target y with
//   the corresponding double precision floating point values in the lower 64
//   bits of the source based on the comparison mode in target w and
//   replaces the destination value with a true flag, which is 64 1s, or a false
//   flag, which is 64 0s, based on the result of the comparison.
//   For these comparisons, +0 = -0.
//
//   Immediate value, comparison mode:
//    0                true if destination = source
//    1                true if destination < source
//    2                true if destination <= source
//    3                true if destination or source is NaN
//    4                true if destination does not = source
//    5                false if destination < source
//                      true if either destination or source is NaN
//    6                false if destination <= source
//                      true if either destination or source is NaN
//    7                true if neither destination nor source is NaN
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM1  XMM0  VCMPSD, // if [RBX][63:0] = XMM1[63:0] then
//                                     //  1 -> XMM0[63:0]
//                                     // else 0 -> XMM0[63:0]
//
//  01 N  RBX [R]  XMM1  XMM0  VCMPSD, // if [RBX][63:0] < XMM1[63:0] then
//                                     //  1 -> XMM0[63:0]
//                                     // else 0 -> XMM0[63:0]
//
//  02 N  RBX [R]  XMM1  XMM0  VCMPSD, // if [RBX][63:0] <= XMM1[63:0] then
//                                     //  1 -> XMM0[63:0]
//                                     // else 0 -> XMM0[63:0]
//
//  04 N  RBX [R]  XMM1  XMM0  VCMPSD, // if [RBX][63:0] != XMM1[63:0] then
//                                     //  1 -> XMM0[63:0]
//                                     // else 0 -> XMM0[63:0]
//
//  01 N  XMM2  XMM1  XMM0  VCMPSD,    // if XMM2[63:0] < XMM1[63:0] then
//                                     //  1 -> XMM0[63:0]
//                                     // else 0 -> XMM0[63:0]
//
//  01 N  XMM2 <- XMM1  XMM0  VCMPSD,  // if XMM0[63:0] < XMM1[63:0] then
//                                     //  1 -> XMM2[63:0]
//                                     // else 0 -> XMM2[63:0]
//
//  01 N  XMM0  XMM1  XMM8 VCMPSD,     // if XMM0[63:0] < XMM1[63:0] then
//                                     //  1 -> XMM8[63:0]
//                                     // else 0 -> XMM8[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register. The first target must be the immediate target. If you use
//   -> it must come after a memory, xmm, or ymm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcmpsqcomma ( CMPSQ, )
//
// C prototype:
//  void dg_forthcmpsqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode sequence for the x86 CMPSQ instruction.
//  Not available in 32 bit addressing mode.
//  In 64 bit addressing mode:
//   Decrements RCX and compares a 64 bit value by doing [RDI] - [RSI].
//   RDI and RSI are adjusted according to the direction flag,
//    clear adds 8 and set subtracts 8.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Note:
//   Some operating systems require the direction bit to
//   be left to forward (clear) when passing control to operating system
//   routines.
//   I usually push the flag register when entering and pop it when exiting
//   subroutines to cover this requirement.
//
// 64 bit addressing mode example:
//  src  RSI  MOV,
//  dest  RDI  MOV,
//  count  RCX  MOV,
//  NZUNTILREP,  CMPSQ,  compares count 64 bit units at src with dest until
//                        a non match is found
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcmpsscomma ( CMPSS, )
//
// C prototype:
//  void dg_forthcmpsscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for the immediate target can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 CMPSS instruction. This opcode sequence compares the single
//   precision floating point value in the lower 32 bits of the destination
//   with the corresponding single precision floating point values in the lower
//   32 bits of the source based on the comparison mode in the immediate value
//   and replaces the destination value with a true flag, which is 32 1s, or a
//   false flag, which is 32 0s, based on the result of the comparisons.
//   For these comparisons, +0 = -0.
//
//   Immediate value, comparison mode:
//    0                true if destination = source
//    1                true if destination < source
//    2                true if destination <= source
//    3                true if destination or source is NaN
//    4                true if destination does not = source
//    5                false if destination < source
//                      true if either destination or source is NaN
//    6                false if destination <= source
//                      true if either destination or source is NaN
//    7                true if neither destination nor source is NaN
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM0  CMPSS,   // if XMM0[31:0] = [RBX][31:0] then
//                                //  all 1s -> XMM0[31:0]
//                                // else 0 -> XMM0[31:0]
//
//  01 N  RBX [R]  XMM0  CMPSS,   // if XMM0[31:0] < [RBX][31:0] then
//                                //  all 1s -> XMM0[31:0]
//                                // else 0 -> XMM0[31:0]
//
//  02 N  RBX [R]  XMM0  CMPSS,   // if XMM0[31:0] <= [RBX][31:0] then
//                                //  all 1s -> XMM0[31:0]
//                                // else 0 -> XMM0[31:0]
//
//  04 N  RBX [R]  XMM0  CMPSS,   // if XMM0[31:0] != [RBX][31:0] then
//                                //  all 1s -> XMM0[31:0]
//                                // else 0 -> XMM0[31:0]
//
//  01 N  XMM2  XMM0  CMPSS,      // if XMM0[31:0] < XMM2[31:0] then
//                                //  all 1s -> XMM0[31:0]
//                                // else 0 -> XMM0[31:0]
//
//  01 N  XMM0 <- XMM2  CMPSS,    // if XMM0[31:0] < XMM2[31:0] then
//                                //  all 1s -> XMM0[31:0]
//                                // else 0 -> XMM0[31:0]
//
//  01 N  XMM8  XMM0 CMPSS,       // if XMM0[31:0] < XMM8[31:0] then
//                                //  all 1s -> XMM0[31:0]
//                                // else 0 -> XMM0[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvcmpsscomma ( VCMPSS, )
//
// C prototype:
//  void dg_forthvcmpsscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist 
//     targetzparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for target y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targets x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for target x and target z can 
//   also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls four targets from the data stack and compiles the opcode sequence for
//   an x86 VCMPSS instruction. This opcode sequence compares the single
//   precision floating point value in the lower 32 bits of target y
//   with the corresponding single precision floating point values in the lower
//   32 bits of the source based on the comparison mode in target w
//   and replaces the destination value with a true flag, which is 32 1s, or a
//   false flag, which is 32 0s, based on the result of the comparisons.
//   For these comparisons, +0 = -0.
//
//   Immediate value, comparison mode:
//    0                true if destination = source
//    1                true if destination < source
//    2                true if destination <= source
//    3                true if destination or source is NaN
//    4                true if destination does not = source
//    5                false if destination < source
//                      true if either destination or source is NaN
//    6                false if destination <= source
//                      true if either destination or source is NaN
//    7                true if neither destination nor source is NaN
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM1  XMM0  VCMPSS, // if XMM1[31:0] = [RBX][31:0] then
//                                     //  all 1s -> XMM0[31:0]
//                                     // else 0 -> XMM0[31:0]
//
//  01 N  RBX [R]  XMM1  XMM0  VCMPSS, // if XMM1[31:0] < [RBX][31:0] then
//                                     //  all 1s -> XMM0[31:0]
//                                     // else 0 -> XMM0[31:0]
//
//  02 N  RBX [R]  XMM1  XMM0  VCMPSS, // if XMM1[31:0] <= [RBX][31:0] then
//                                     //  all 1s -> XMM0[31:0]
//                                     // else 0 -> XMM0[31:0]
//
//  04 N  RBX [R]  XMM1  XMM0  VCMPSS, // if XMM1[31:0] != [RBX][31:0] then
//                                     //  all 1s -> XMM0[31:0]
//                                     // else 0 -> XMM0[31:0]
//
//  01 N  XMM2  XMM1  XMM0  VCMPSS,    // if XMM1[31:0] < XMM2[31:0] then
//                                     //  all 1s -> XMM0[31:0]
//                                     // else 0 -> XMM0[31:0]
//
//  01 N  XMM0 <- XMM1  XMM2  VCMPSS,  // if XMM1[31:0] < XMM2[31:0] then
//                                     //  all 1s -> XMM0[31:0]
//                                     // else 0 -> XMM0[31:0]
//
//  01 N  XMM8  XMM1  XMM0 VCMPSS,     // if XMM1[31:0] < XMM8[31:0] then
//                                     //  all 1s -> XMM0[31:0]
//                                     // else 0 -> XMM0[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register. The first target must be the immediate target. If you use
//   -> it must come after a memory, xmm or ymm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcmpswcomma ( CMPSW, )
//
// C prototype:
//  void dg_forthcmpswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode sequence for an x86 CMPSW instruction.
//  In 32 bit addressing mode:
//   Decrements ECX and compares a 16 bit value by doing [EDI] - [ESI].
//   EDI and ESI are adjusted according to the direction flag,
//    clear adds 2 and set subtracts 2.
//  In 64 bit addressing mode:
//   Decrements RCX and compares a 16 bit value by doing [RDI] - [RSI].
//   RDI and RSI are adjusted according to the direction flag,
//    clear adds 2 and set subtracts 2.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Note:
//   Some operating systems require the direction bit to
//   be left to forward (clear) when passing control to operating system
//   routines.
//   I usually push the flag register when entering and pop it when exiting
//   subroutines to cover this requirement.
//
// 32 bit addressing mode example:
//  src  ESI  MOV,
//  dest  EDI  MOV,
//  count  ECX  MOV,
//  NZUNTILREP,  CMPSW,  compares count 16 bit units at src with dest until
//                        a non match is found
//
// 64 bit addressing mode example:
//  src  RSI  MOV,
//  dest  RDI  MOV,
//  count  RCX  MOV,
//  NZUNTILREP,  CMPSW,  compares count 16 bit units at src with dest until
//                        a non match is found
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcmpxchgcomma ( CMPXCHG, )
//
// C prototype:
//  void dg_forthcmpxchgcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   currentcompilebufferoffset [O]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of these parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 CMPXCHG instruction.
//   If the memory target is equal to register a, then the register target
//   is copied to the memory target and the zero flag is set.
//   If the memory target is not equal to register a, then the memory target
//   is copied to register a and the zero flag is cleared.
//
// Note:
//  If you specify a register and a memory target, it doesn't matter which one
//   is target x or target y or what direction is used. The memory target is
//   scanned and the register target receives the result.
//  If you specify two register targets, target x is scanned by default and
//   target y receives the result.
//  If you specify two register targets along with <- then target y is
//   scanned and target x receives the result.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcmpxchg16bcomma ( CMPXCHG16B, )
//
// C prototype:
//  void dg_forthcmpxchg16bcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//
//
//  The parameter list for the target can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of these parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 CMPXCHG16B instruction.
//   If the 128 bit value in the memory target is equal to the contents of
//   what is in the register pair RDX:RAX, then the zero flag is set and the
//   contents of RCX:RBX are copied to the memory target. Otherwise, the
//   zero flag is cleared and the contents of the memory target are copied to
//   RDX:RAX. The other flags are not affected.
//
// 64 bit address mode example:
//  RDI [R]  CMPXCHG16B,         // if [RDI][127:0] = RDX:RAX then
//                               //  RCX:RBX -> [RDI][127:0]
//                               // else
//                               //  [RDI][127:0] -> RDX:RAX
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcmpxchg8bcomma ( CMPXCHG8B, )
//
// C prototype:
//  void dg_forthcmpxchg8bcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//
//
//  The parameter list for the target can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of these parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 CMPXCHG8B instruction.
//   If the 64 bit value in the memory target is equal to the contents of
//   what is in the register pair EDX:EAX, then the zero flag is set and the
//   contents of ECX:EBX are copied to the memory target. Otherwise, the
//   zero flag is cleared and the contents of the memory target are copied to
//   EDX:EAX. The other flags are not affected.
//
// 64 bit address mode example:
//  RDI [R]  CMPXCHG8B,          // if [RDI][63:0] = EDX:EAX then
//                               //  ECX:EBX -> [RDI][63:0]
//                               // else
//                               //  [RDI][63:0] -> EDX:EAX
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcomisdcomma ( COMISD, )
//
// C prototype:
//  void dg_forthcomisdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 COMISD instruction. This sequence compares the low floating point
//   double precision value in the destination target with the low floating point
//   double precision value in the source target and sets the condition code flags
//   based on the result. The source and destination are not changed.
//   If the destination was < source, then the carry flag is set, otherwise
//    the carry flag is cleared.
//   If the destination equals the source, then the zero flag is set, otherwise
//    the zero flag is cleared.
//   If the result of the compare is unordered, which means it couldn't do the
//    compare because one or both values were a not-a-number (NaN) value, or not a
//    floating point value, or something else, then the parity bit is set, otherwise
//    the parity bit is cleared.
//   Intel docs say floating point exceptions can be generated if the number is
//    SNan, QNan, or Denormal.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  COMISD,     // [RBX] fsub XMM0[63:0] (only flags are changed)
//  XMM2  XMM0  COMISD,        // XMM2[63:0] fsub XMM0[63:0] (only flags are changed)
//  XMM2 <-  XMM0  COMISD,  // XMM0[63:0] fsub XMM2[63:0] (only flags are changed)
//  XMM0  XMM8  COMISD,        // XMM0[63:0] fsub XMM8[63:0] (only flags are changed)
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvcomisdcomma ( VCOMISD, )
//
// C prototype:
//  void dg_forthvcomisdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VCOMISD instruction. This sequence compares the low floating point
//   double precision value in the destination target with the low floating point
//   double precision value in the source target and sets the condition code flags
//   based on the result. The source and destination are not changed.
//   If the destination was < source, then the carry flag is set, otherwise
//    the carry flag is cleared.
//   If the destination equals the source, then the zero flag is set, otherwise
//    the zero flag is cleared.
//   If the result of the compare is unordered, which means it couldn't do the
//    compare because one or both values were a not-a-number (NaN) value, or not a
//    floating point value, or something else, then the parity bit is set, otherwise
//    the parity bit is cleared.
//   Intel docs say floating point exceptions can be generated if the number is
//    SNan, QNan, or Denormal.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VCOMISD,     // [RBX] fsub XMM0[63:0] (only flags are changed)
//  XMM2  XMM0  VCOMISD,        // XMM2[63:0] fsub XMM0[63:0] (only flags are changed)
//  XMM2 <-  XMM0  VCOMISD,  // XMM0[63:0] fsub XMM2[63:0] (only flags are changed)
//  XMM0  XMM8  VCOMISD,        // XMM0[63:0] fsub XMM8[63:0] (only flags are changed)
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcomisscomma ( COMISS, )
//
// C prototype:
//  void dg_forthcomisscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 COMISS instruction. This sequence compares the low floating point
//   single precision value in the destination target with the low floating point
//   single precision value in the source target and sets the condition code flags
//   based on the result. The source and destination are not changed.
//   If the destination was < source, then the carry flag is set, otherwise
//    the carry flag is cleared.
//   If the destination equals the source, then the zero flag is set, otherwise
//    the zero flag is cleared.
//   If the result of the compare is unordered, which means it couldn't do the
//    compare because one or both values were a not-a-number (NaN) value, or not a
//    floating point value, or something else, then the parity bit is set, otherwise
//    the parity bit is cleared.
//   Intel docs say floating point exceptions can be generated if the number is
//    SNan, QNan, or Denormal.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  COMISS,     // [RBX] fsub XMM0[31:0] (only flags are changed)
//  XMM2  XMM0  COMISS,        // XMM2[31:0] fsub XMM0[31:0] (only flags are changed)
//  XMM2 <-  XMM0  COMISS,  // XMM0[31:0] fsub XMM2[31:0] (only flags are changed)
//  XMM0  XMM8  COMISS,        // XMM0[31:0] fsub XMM8[31:0] (only flags are changed)
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvcomisscomma ( VCOMISS, )
//
// C prototype:
//  void dg_forthvcomisscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VCOMISS instruction. This sequence compares the low floating point
//   single precision value in the destination target with the low floating point
//   single precision value in the source target and sets the condition code flags
//   based on the result. The source and destination are not changed.
//   If the destination was < source, then the carry flag is set, otherwise
//    the carry flag is cleared.
//   If the destination equals the source, then the zero flag is set, otherwise
//    the zero flag is cleared.
//   If the result of the compare is unordered, which means the compare could not
//    be done because one or both values were not a number (NaN), or not a
//    floating point value, or something else, then the parity flag is set, otherwise
//    the parity flag is cleared.
//   Intel docs say floating point exceptions can be generated if the number is
//    SNaN, QNaN, or denormal.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VCOMISS,     // [RBX] fsub XMM0[31:0] (only flags are changed)
//  XMM2  XMM0  VCOMISS,        // XMM2[31:0] fsub XMM0[31:0] (only flags are changed)
//  XMM2 <-  XMM0  VCOMISS,  // XMM0[31:0] fsub XMM2[31:0] (only flags are changed)
//  XMM0  XMM8  VCOMISS,        // XMM0[31:0] fsub XMM8[31:0] (only flags are changed)
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcpuidcomma ( CPUID, )
//
// C prototype:
//  void dg_forthcpuidcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode sequence for an x86 CPUID instruction.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  CPUID,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcqocomma ( CQO, RAX->RDX:RAX, )
//
// C prototype:
//  void dg_forthcqocomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode string for the x86 CQO instruction.
//  This instruction sign extends RAX into RDX:RAX
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  RAX->RDX:RAX,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcrc32comma ( CRC32, )
//
// C prototype:
//  void dg_forthcrc32comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how targetxparameterlist is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the data size for a memory target, you can use this:
//   16BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after targetxparameterlist
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 2
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after targetxparameterlist
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 CRC32 instruction. This opcode sequence incorporates the source
//   target value into the current crc total in the destination. The size for
//   this instruction is taken from the source, which means if you are using
//   a memory target for the source, then you have to specify a size.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]   CX   CRC32,     //  CRC32(CX, [RAX]) -> CX
//
// Note:
//  Intel docs show an 8 bit source to 64 bit destination mode for this
//   instruction using the REX.W prefix that zero extends the 32 bit result
//   to 64 bits. The 8 bit to 32 bit mode does not show zero extension, but
//   don't all writes to a 32 bit register get automatically zero extended to
//   64 bits? Because of this, I did not support the REX.W 8 bit to 64 bit
//   mode. If you want to use the 8 bit to 64 bit mode just put the REX.W
//   prefix in front of the 8 bit to 32 bit version of this instruction.
//   For example:
//    48 CODE-U8,  RAX [R]  AL  CRC32,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcvtdq2pdcomma ( CVTDQ2PD, )
//
// C prototype:
//  void dg_forthcvtdq2pdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 CVTDQ2PD instruction. This sequence converts 2 signed 32 bit
//   integers from the low 64 bits of the source into 2 double floating point
//   values and puts them in the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  CVTDQ2PD,     // [RBX][31:0] INT32->FP64 -> XMM0[63:0]
//                               // [RBX][63:32] INT32->FP64 -> XMM0[127:64]
//
//  XMM2  XMM0  CVTDQ2PD,        // XMM2[31:0] INT32->FP64 -> XMM0[63:0]
//                               // XMM2[63:32] INT32->FP64 -> XMM0[127:64]
//
//  XMM2 <-  XMM0  CVTDQ2PD,  // XMM0[31:0] INT32->FP64 -> XMM2[63:0]
//                               // XMM0[63:32] INT32->FP64 -> XMM2[127:64]
//
//  XMM0  XMM8  CVTDQ2PD,        // XMM0[31:0] INT32->FP64 -> XMM8[63:0]
//                               // XMM0[63:32] INT32->FP64 -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvcvtdq2pdcomma ( VCVTDQ2PD, )
//
// C prototype:
//  void dg_forthvcvtdq2pdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VCVTDQ2PD instruction. This sequence converts 2 or 4 signed 32 bit
//   integers from the low 64 or 128 bits of the source into 2 or 4 double floating
//   point values and puts them in the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VCVTDQ2PD,    // [RBX][31:0] INT32->FP64 -> XMM0[63:0]
//                               // [RBX][63:32] INT32->FP64 -> XMM0[127:64]
//
//  XMM2  XMM0  VCVTDQ2PD,       // XMM2[31:0] INT32->FP64 -> XMM0[63:0]
//                               // XMM2[63:32] INT32->FP64 -> XMM0[127:64]
//
//  XMM2 <-  XMM0  VCVTDQ2PD, // XMM0[31:0] INT32->FP64 -> XMM2[63:0]
//                               // XMM0[63:32] INT32->FP64 -> XMM2[127:64]
//
//  XMM0  YMM8  VCVTDQ2PD,      // XMM0[31:0]   INT32->FP64 -> YMM8[63:0]
//                              // XMM0[63:32]  INT32->FP64 -> YMM8[127:64]
//                              // XMM0[95:64]  INT32->FP64 -> YMM8[191:128]
//                              // XMM0[127:96] INT32->FP64 -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcvtdq2pscomma ( CVTDQ2PS, )
//
// C prototype:
//  void dg_forthcvtdq2pscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 CVTDQ2PS instruction. This sequence converts 4 signed 32 bit
//   integers from the source into 4 single floating point values and puts
//   them in the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  CVTDQ2PS,     // [RBX][31:0] INT32->FP32 -> XMM0[31:0]
//                               // [RBX][63:32] INT32->FP32 -> XMM0[63:32]
//                               // [RBX][95:64] INT32->FP32 -> XMM0[95:64]
//                               // [RBX][127:96] INT32->FP32 -> XMM0[127:96]
//
//  XMM2  XMM0  CVTDQ2PS,        // XMM2[31:0] INT32->FP32 -> XMM0[31:0]
//                               // XMM2[63:32] INT32->FP32 -> XMM0[63:32]
//                               // XMM2[95:64] INT32->FP32 -> XMM0[95:64]
//                               // XMM2[127:96] INT32->FP32 -> XMM0[127:96]
//
//  XMM2 <-  XMM0  CVTDQ2PS,  // XMM0[31:0] INT32->FP32 -> XMM2[31:0]
//                               // XMM0[63:32] INT32->FP32 -> XMM2[63:32]
//                               // XMM0[95:64] INT32->FP32 -> XMM2[95:64]
//                               // XMM0[127:96] INT32->FP32 -> XMM2[127:96]
//
//  XMM0  XMM8  CVTDQ2PS,        // XMM0[31:0] INT32->FP32 -> XMM8[31:0]
//                               // XMM0[63:32] INT32->FP32 -> XMM8[63:32]
//                               // XMM0[95:64] INT32->FP32 -> XMM8[95:64]
//                               // XMM0[127:96] INT32->FP32 -> XMM8[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvcvtdq2pscomma ( VCVTDQ2PS, )
//
// C prototype:
//  void dg_forthvcvtdq2pscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VCVTDQ2PS instruction. This sequence converts 4 or 8 signed 32 bit
//   integers from the source into 4 or 8 single floating point values and puts
//   them in the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VCVTDQ2PS,    // [RBX][31:0] INT32->FP32 -> XMM0[31:0]
//                               // [RBX][63:32] INT32->FP32 -> XMM0[63:32]
//                               // [RBX][95:64] INT32->FP32 -> XMM0[95:64]
//                               // [RBX][127:96] INT32->FP32 -> XMM0[127:96]
//
//  XMM2  XMM0  VCVTDQ2PS,       // XMM2[31:0] INT32->FP32 -> XMM0[31:0]
//                               // XMM2[63:32] INT32->FP32 -> XMM0[63:32]
//                               // XMM2[95:64] INT32->FP32 -> XMM0[95:64]
//                               // XMM2[127:96] INT32->FP32 -> XMM0[127:96]
//
//  XMM2 <-  XMM0  VCVTDQ2PS, // XMM0[31:0] INT32->FP32 -> XMM2[31:0]
//                               // XMM0[63:32] INT32->FP32 -> XMM2[63:32]
//                               // XMM0[95:64] INT32->FP32 -> XMM2[95:64]
//                               // XMM0[127:96] INT32->FP32 -> XMM2[127:96]
//
//  YMM0  YMM8  VCVTDQ2PS,       // YMM0[31:0] INT32->FP32 -> YMM8[31:0]
//                               // YMM0[63:32] INT32->FP32 -> YMM8[63:32]
//                               // YMM0[95:64] INT32->FP32 -> YMM8[95:64]
//                               // YMM0[127:96] INT32->FP32 -> YMM8[127:96]
//                               // YMM0[159:128] INT32->FP32 -> YMM8[159:128]
//                               // YMM0[191:160] INT32->FP32 -> YMM8[191:160]
//                               // YMM0[223:192] INT32->FP32 -> YMM8[223:192]
//                               // YMM0[255:224] INT32->FP32 -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcvtpd2dqcomma ( CVTPD2DQ, )
//
// C prototype:
//  void dg_forthcvtpd2dqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 CVTPD2DQ instruction. This sequence converts 2 double floating point
//   values from the source into 2 signed 32 bit integers and puts them into the
//   low 64 bits of the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  CVTPD2DQ,     // [RBX][63:0] FP64->INT32 -> XMM0[31:0]
//                               // [RBX][127:64] FP64->INT32 -> XMM0[63:32]
//
//  XMM2  XMM0  CVTPD2DQ,        // XMM2[63:0] FP64->INT32 -> XMM0[31:0]
//                               // XMM2[127:64] FP64->INT32 -> XMM0[63:32]
//
//  XMM2 <-  XMM0  CVTPD2DQ,  // XMM0[63:0] FP64->INT32 -> XMM2[31:0]
//                               // XMM0[127:64] FP64->INT32 -> XMM2[63:32]
//
//  XMM0  XMM8  CVTPD2DQ,        // XMM0[63:0] FP64->INT32 -> XMM8[31:0]
//                               // XMM0[127:64] FP64->INT32 -> XMM8[63:32]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvcvtpd2dqcomma ( VCVTPD2DQ, )
//
// C prototype:
//  void dg_forthvcvtpd2dqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VCVTPD2DQ instruction. This sequence converts 2 or 4 double floating 
//   point values from the source into 2 or 4 signed 32 bit integers and puts 
//   them into the low 64 or 128 bits of the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VCVTPD2DQ,    // [RBX][63:0]   FP64->INT32 -> XMM0[31:0]
//                               // [RBX][127:64] FP64->INT32 -> XMM0[63:32]
//
//  XMM2  XMM0  VCVTPD2DQ,       // XMM2[63:0]   FP64->INT32 -> XMM0[31:0]
//                               // XMM2[127:64] FP64->INT32 -> XMM0[63:32]
//
//  XMM2 <-  XMM0  VCVTPD2DQ, // XMM0[63:0]   FP64->INT32 -> XMM2[31:0]
//                               // XMM0[127:64] FP64->INT32 -> XMM2[63:32]
//
//  YMM0  YMM8  VCVTPD2DQ,       // YMM0[63:0]    FP64->INT32 -> YMM8[31:0]
//                               // YMM0[127:64]  FP64->INT32 -> YMM8[63:32]
//                               // YMM0[191:128] FP64->INT32 -> YMM8[95:64]
//                               // YMM0[255:192] FP64->INT32 -> YMM8[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvcvtph2pscomma ( VCVTPH2PS, )
//
// C prototype:
//  void dg_forthvcvtph2pscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VCVTPH2PS instruction. This sequence converts 4 or 8 16 bit floating
//   point values from the source into 4 or 8 single floating point values and
//   puts them into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VCVTPH2PS,    // [RBX][15:0]  FP16->FP32 -> XMM0[31:0]
//                               // [RBX][31:16] FP16->FP32 -> XMM0[63:32]
//                               // [RBX][47:32] FP16->FP32 -> XMM0[95:64]
//                               // [RBX][63:48] FP16->FP32 -> XMM0[127:96]
//
//  XMM2  XMM0  VCVTPH2PS,       // XMM2[15:0]  FP16->FP32 -> XMM0[31:0]
//                               // XMM2[31:16] FP16->FP32 -> XMM0[63:32]
//                               // XMM2[47:32] FP16->FP32 -> XMM0[95:64]
//                               // XMM2[63:48] FP16->FP32 -> XMM0[127:96]
//
//  XMM0 <-  XMM2  VCVTPH2PS, // XMM2[15:0]  FP16->FP32 -> XMM0[31:0]
//                               // XMM2[31:16] FP16->FP32 -> XMM0[63:32]
//                               // XMM2[47:32] FP16->FP32 -> XMM0[95:64]
//                               // XMM2[63:48] FP16->FP32 -> XMM0[127:96]
//
//  YMM0  YMM8  VCVTPH2PS,       // YMM0[15:0]    FP16->FP32 -> YMM8[31:0]
//                               // YMM0[31:16]   FP16->FP32 -> YMM8[63:32]
//                               // YMM0[47:32]   FP16->FP32 -> YMM8[95:64]
//                               // YMM0[63:48]   FP16->FP32 -> YMM8[127:96]
//                               // YMM0[79:64]   FP16->FP32 -> YMM8[159:128]
//                               // YMM0[95:80]   FP16->FP32 -> YMM8[191:160]
//                               // YMM0[111:96]  FP16->FP32 -> YMM8[223:192]
//                               // YMM0[127:112] FP16->FP32 -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcvtpd2picomma ( CVTPD2PI, )
//
// C prototype:
//  void dg_forthcvtpd2picomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 CVTPD2PI instruction. This opcode sequence converts two double
//   precision floating point values from an xmm register or memory source
//   into two signed 32 bit integers and puts them into the
//   destination floating point register. Rounding uses the MXCSR register setting.
//   Reverse is not supported for CVTPD2PI.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]   XMM0   CVTPD2PI,     //  XMM0[63:0]   FP64->I32 -> [RAX][31:0]
//                                 //  XMM0[127:64] FP64->I32 -> [RAX][63:32]
//
//  ST1   XMM0   CVTPD2PI,         //  XMM0[63:0]   FP64->I32 -> ST1[31:0]
//                                 //  XMM0[127:64] FP64->I32 -> ST1[63:32]
//
// Note:
//  You do not need to specify a data size for memory target destinations. If you do,
//   it must be 64BIT.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcvtpd2pscomma ( CVTPD2PS, )
//
// C prototype:
//  void dg_forthcvtpd2pscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 Push this after the complete parameter list
//                                 of either target; it can not come in the
//                                 middle of a target's addressing mode
//                                 parameters.
//   <-                           sets the direction to reverse.
//                                 Push this after the complete parameter list
//                                 of either target; it can not come in the
//                                 middle of a target's addressing mode
//                                 parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 CVTPD2PS instruction. This sequence converts 2 double precision
//   floating point values from the source into 2 single precision floating
//   point values and puts them into the low 64 bits of the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  CVTPD2PS,     // [RBX][63:0] FP64->FP32 -> XMM0[31:0]
//                               // [RBX][127:64] FP64->FP32 -> XMM0[63:32]
//
//  XMM2  XMM0  CVTPD2PS,        // XMM2[63:0] FP64->FP32 -> XMM0[31:0]
//                               // XMM2[127:64] FP64->FP32 -> XMM0[63:32]
//
//  XMM2 <-  XMM0  CVTPD2PS,  // XMM0[63:0] FP64->FP32 -> XMM2[31:0]
//                               // XMM0[127:64] FP64->FP32 -> XMM2[63:32]
//
//  XMM0  XMM8  CVTPD2PS,        // XMM0[63:0] FP64->FP32 -> XMM8[31:0]
//                               // XMM0[127:64] FP64->FP32 -> XMM8[63:32]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcvtpi2pdcomma ( CVTPI2PD, )
//
// C prototype:
//  void dg_forthcvtpi2pdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 CVTPI2PD instruction. This opcode sequence converts two signed 32 bit
//   integers from the memory or floating point register source to two double
//   precision floating point values and puts them into the destination.
//   Reverse is not supported for CVTPI2PD.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]   XMM0   CVTPI2PD,     //  [RAX][31:0]  I32->FP64 -> XMM0[63:0]
//                                 //  [RAX][63:32] I32->FP64 -> XMM0[127:64]
//
//  ST1   XMM0   CVTPI2PD,         //  ST1[31:0]  I32->FP64 -> XMM0[63:0]
//                                 //  ST1[63:32] I32->FP64 -> XMM0[127:64]
//
// Note:
//  You do not need to specify a data size for memory target sources. If you do,
//   it must be 64BIT.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcvtpi2pscomma ( CVTPI2PS, )
//
// C prototype:
//  void dg_forthcvtpi2pscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 CVTPI2PS instruction. This opcode sequence converts two signed 32 bit
//   integers from the memory or floating point register source to two single
//   precision floating point values and puts them into the destination xmm
//   register.
//   Reverse is not supported for CVTPI2PS.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]   XMM0   CVTPI2PS,     //  [RAX][31:0]  I32->FP32 -> XMM0[31:0]
//                                 //  [RAX][63:32] I32->FP32 -> XMM0[63:32]
//
//  ST1   XMM0   CVTPI2PS,         //  ST1[31:0]  I32->FP32 -> XMM0[31:0]
//                                 //  ST1[63:32] I32->FP32 -> XMM0[63:32]
//
// Note:
//  You do not need to specify a data size for memory target sources. If you do,
//   it must be 64BIT.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcvtps2dqcomma ( CVTPS2DQ, )
//
// C prototype:
//  void dg_forthcvtps2dqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 Push this after the complete parameter list
//                                 of either target; it can not come in the
//                                 middle of a target's addressing mode
//                                 parameters.
//   <-                           sets the direction to reverse.
//                                 Push this after the complete parameter list
//                                 of either target; it can not come in the
//                                 middle of a target's addressing mode
//                                 parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 CVTPS2DQ instruction. This sequence converts 4 floating point
//   single precision values from the source into 4 signed doubleword integer
//   values and puts them into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  CVTPS2DQ,     // [RBX][31:0] FP32->INT32 -> XMM0[31:0]
//                               // [RBX][63:32] FP32->INT32 -> XMM0[63:32]
//                               // [RBX][95:64] FP32->INT32 -> XMM0[95:64]
//                               // [RBX][127:96] FP32->INT32 -> XMM0[127:96]
//
//  XMM2  XMM0  CVTPS2DQ,        // XMM2[31:0] FP32->INT32 -> XMM0[31:0]
//                               // XMM2[63:32] FP32->INT32 -> XMM0[63:32]
//                               // XMM2[95:64] FP32->INT32 -> XMM0[95:64]
//                               // XMM2[127:96] FP32->INT32 -> XMM0[127:96]
//
//  XMM2 <-  XMM0  CVTPS2DQ,  // XMM0[31:0] FP32->INT32 -> XMM2[31:0]
//                               // XMM0[63:32] FP32->INT32 -> XMM2[63:32]
//                               // XMM0[95:64] FP32->INT32 -> XMM2[95:64]
//                               // XMM0[127:96] FP32->INT32 -> XMM2[127:96]
//
//  XMM0  XMM8  CVTPS2DQ,        // XMM0[31:0] FP32->INT32 -> XMM8[31:0]
//                               // XMM0[63:32] FP32->INT32 -> XMM8[63:32]
//                               // XMM0[95:64] FP32->INT32 -> XMM8[95:64]
//                               // XMM0[127:96] FP32->INT32 -> XMM8[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvcvtps2dqcomma ( VCVTPS2DQ, )
//
// C prototype:
//  void dg_forthvcvtps2dqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 Push this after either target's addressing
//                                 mode parameters; it cannot come in the
//                                 middle of a target's addressing mode
//                                 parameters.
//   <-                           sets the direction to reverse.
//                                 Push this after either target's addressing
//                                 mode parameters; it cannot come in the
//                                 middle of a target's addressing mode
//                                 parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VCVTPS2DQ instruction. This sequence converts 4 or 8 floating point
//   single precision values from the source into 4 or 8 signed doubleword integer
//   values and puts them into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VCVTPS2DQ,    // [RBX][31:0]   FP32->INT32 -> XMM0[31:0]
//                               // [RBX][63:32]  FP32->INT32 -> XMM0[63:32]
//                               // [RBX][95:64]  FP32->INT32 -> XMM0[95:64]
//                               // [RBX][127:96] FP32->INT32 -> XMM0[127:96]
//
//  XMM2  XMM0  VCVTPS2DQ,       // XMM2[31:0]   FP32->INT32 -> XMM0[31:0]
//                               // XMM2[63:32]  FP32->INT32 -> XMM0[63:32]
//                               // XMM2[95:64]  FP32->INT32 -> XMM0[95:64]
//                               // XMM2[127:96] FP32->INT32 -> XMM0[127:96]
//
//  XMM2 <-  XMM0  VCVTPS2DQ,    // XMM0[31:0]   FP32->INT32 -> XMM2[31:0]
//                               // XMM0[63:32]  FP32->INT32 -> XMM2[63:32]
//                               // XMM0[95:64]  FP32->INT32 -> XMM2[95:64]
//                               // XMM0[127:96] FP32->INT32 -> XMM2[127:96]
//
//  YMM0  YMM8  VCVTPS2DQ,       // YMM0[31:0]    FP32->INT32 -> YMM8[31:0]
//                               // YMM0[63:32]   FP32->INT32 -> YMM8[63:32]
//                               // YMM0[95:64]   FP32->INT32 -> YMM8[95:64]
//                               // YMM0[127:96]  FP32->INT32 -> YMM8[127:96]
//                               // YMM0[159:128] FP32->INT32 -> YMM8[159:128]
//                               // YMM0[191:160] FP32->INT32 -> YMM8[191:160]
//                               // YMM0[223:192] FP32->INT32 -> YMM8[223:192]
//                               // YMM0[255:224] FP32->INT32 -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcvtps2pdcomma ( CVTPS2PD, )
//
// C prototype:
//  void dg_forthcvtps2pdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 Push this after either target's addressing
//                                 mode parameters; it cannot come in the
//                                 middle of a target's addressing mode
//                                 parameters.
//   <-                           sets the direction to reverse.
//                                 Push this after either target's addressing
//                                 mode parameters; it cannot come in the
//                                 middle of a target's addressing mode
//                                 parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 CVTPS2PD instruction. This sequence converts 2 single precision
//   floating point values from the lower 64 bits of the source into 2 double
//   precision floating point values and puts them into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  CVTPS2PD,     // [RBX][31:0] FP32->FP64 -> XMM0[63:0]
//                               // [RBX][63:32] FP32->FP64 -> XMM0[127:64]
//
//  XMM2  XMM0  CVTPS2PD,        // XMM2[31:0] FP32->FP64 -> XMM0[63:0]
//                               // XMM2[63:32] FP32->FP64 -> XMM0[127:64]
//
//  XMM2 <-  XMM0  CVTPS2PD,     // XMM0[31:0] FP32->FP64 -> XMM2[63:0]
//                               // XMM0[63:32] FP32->FP64 -> XMM2[127:64]
//
//  XMM0  XMM8  CVTPS2PD,        // XMM0[31:0] FP32->FP64 -> XMM8[63:0]
//                               // XMM0[63:32] FP32->FP64 -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvcvtps2pdcomma ( VCVTPS2PD, )
//
// C prototype:
//  void dg_forthvcvtps2pdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 Push this after either target's addressing
//                                 mode parameters; it cannot come in the
//                                 middle of a target's addressing mode
//                                 parameters.
//   <-                           sets the direction to reverse.
//                                 Push this after either target's addressing
//                                 mode parameters; it cannot come in the
//                                 middle of a target's addressing mode
//                                 parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VCVTPS2PD instruction. This sequence converts 2 or 4 single precision
//   floating point values from the lower half of the source into 2 or 4 double
//   precision floating point values and puts them into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VCVTPS2PD,     // [RBX][31:0] FP32->FP64 -> XMM0[63:0]
//                                // [RBX][63:32] FP32->FP64 -> XMM0[127:64]
//
//  XMM2  XMM0  VCVTPS2PD,        // XMM2[31:0] FP32->FP64 -> XMM0[63:0]
//                                // XMM2[63:32] FP32->FP64 -> XMM0[127:64]
//
//  XMM2 <-  XMM0  VCVTPS2PD,     // XMM0[31:0] FP32->FP64 -> XMM2[63:0]
//                                // XMM0[63:32] FP32->FP64 -> XMM2[127:64]
//
//  YMM0  YMM8  VCVTPS2PD,        // YMM0[31:0]   FP32->FP64 -> YMM8[63:0]
//                                // YMM0[63:32]  FP32->FP64 -> YMM8[127:64]
//                                // YMM0[95:64]  FP32->FP64 -> YMM8[191:128]
//                                // YMM0[127:96] FP32->FP64 -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvcvtps2phcomma ( VCVTPS2PH, )
//
// C prototype:
//  void dg_forthvcvtps2phcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 Push this after either target's addressing
//                                 mode parameters; it cannot come in the
//                                 middle of a target's addressing mode
//                                 parameters.
//   <-                           sets the direction to reverse.
//                                 Push this after either target's addressing
//                                 mode parameters; it cannot come in the
//                                 middle of a target's addressing mode
//                                 parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VCVTPS2PH instruction. This sequence converts 4 or 8 single precision
//   floating point values from the source into 4 or 8 half precision floating
//   point values and puts them into the lower half of the destination register
//   or into memory.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VCVTPS2PH,     // [RBX][31:0]   FP32->FP16 -> XMM0[15:0]
//                                // [RBX][63:32]  FP32->FP16 -> XMM0[31:16]
//                                // [RBX][95:64]  FP32->FP16 -> XMM0[47:32]
//                                // [RBX][127:96] FP32->FP16 -> XMM0[63:48]
//
//  XMM2  XMM0  VCVTPS2PH,        // XMM2[31:0]   FP32->FP16 -> XMM0[15:0]
//                                // XMM2[63:32]  FP32->FP16 -> XMM0[31:16]
//                                // XMM2[95:64]  FP32->FP16 -> XMM0[47:32]
//                                // XMM2[127:96] FP32->FP16 -> XMM0[63:48]
//
//  XMM0 <-  XMM2  VCVTPS2PH,     // XMM2[31:0]   FP32->FP16 -> XMM0[15:0]
//                                // XMM2[63:32]  FP32->FP16 -> XMM0[31:16]
//                                // XMM2[95:64]  FP32->FP16 -> XMM0[47:32]
//                                // XMM2[127:96] FP32->FP16 -> XMM0[63:48]
//
//  YMM0  YMM8  VCVTPS2PH,        // YMM0[31:0]    FP32->FP16 -> YMM8[15:0]
//                                // YMM0[63:32]   FP32->FP16 -> YMM8[31:16]
//                                // YMM0[95:64]   FP32->FP16 -> YMM8[47:32]
//                                // YMM0[127:96]  FP32->FP16 -> YMM8[63:48]
//                                // YMM0[159:128] FP32->FP16 -> YMM8[79:64]
//                                // YMM0[191:160] FP32->FP16 -> YMM8[95:80]
//                                // YMM0[223:192] FP32->FP16 -> YMM8[111:96]
//                                // YMM0[255:224] FP32->FP16 -> YMM8[127:112]
//
// Note:
//  Only 1 target can be a memory target. The source target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcvtps2picomma ( CVTPS2PI, )
//
// C prototype:
//  void dg_forthcvtps2picomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 CVTPS2PI instruction. This opcode sequence converts two single
//   precision floating point values from an xmm register or memory source
//   into two signed 32 bit integers and puts them into the
//   destination floating point register. Rounding uses the MXCSR register setting.
//   Reverse is not supported for CVTPS2PI,
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]   XMM0   CVTPS2PI,  //  XMM0[31:0]  FP32->I32 -> [RAX][31:0]
//                              //  XMM0[63:32] FP32->I32 -> [RAX][63:32]
//
//  ST1   XMM0   CVTPS2PI,      //  XMM0[31:0]  FP32->I32 -> ST1[31:0]
//                              //  XMM0[63:32] FP32->I32 -> ST1[63:32]
//
// Note:
//  You do not need to specify a data size for memory target destinations. If 
//   you do, it must be 64BIT.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcvtsd2sicomma ( CVTSD2SI, )
//
// C prototype:
//  void dg_forthcvtsd2sicomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   R                            specifies a register target
//                                 R is optional
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 CVTSD2SI instruction. This opcode sequence gets a double
//   precision floating point value from an xmm register or memory source and
//   converts it into a signed integer and puts it into the
//   destination register. If the destination register is 32 bits, the floating
//   point value is converted to a signed 32 bit integer. If the destination
//   register is 64 bits, the floating point value is converted to a signed
//   64 bit integer. Rounding uses the MXCSR register setting.
//   Reverse is not supported for CVTSD2SI,
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]   RCX   CVTSD2SI,      //  [RAX][63:0]  FP64->I64 -> RCX[63:0]
//
//  XMM0  ECX   CVTSD2SI,          //  XMM0[63:0]   FP64->I32 -> RCX[31:0]
//
// Note:
//  Only 32 bit and 64 bit destination registers are supported for this compiling
//   word. The source size is ignored and set to the destination size.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvcvtsd2sicomma ( VCVTSD2SI, )
//
// C prototype:
//  void dg_forthvcvtsd2sicomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   R                            specifies a register target
//                                 R is optional
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VCVTSD2SI instruction. This opcode sequence gets a double
//   precision floating point value from an xmm register or memory source and
//   converts it into a signed integer and puts it into the
//   destination register. If the destination register is 32 bits, the floating
//   point value is converted to a signed 32 bit integer. If the destination
//   register is 64 bits, the floating point value is converted to a signed
//   64 bit integer. Rounding uses the MXCSR register setting.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]   RCX   VCVTSD2SI,  //  [RAX][63:0]  FP64->I64 -> RCX[63:0]
//
//  XMM0  ECX   VCVTSD2SI,      //  XMM0[63:0]   FP64->I32 -> RCX[31:0]
//
// Note:
//  Only 32 bit and 64 bit destination registers are supported for this compiling
//   word. The source size is ignored and set to the destination size.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcvtsd2sscomma ( CVTSD2SS, )
//
// C prototype:
//  void dg_forthcvtsd2sscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either target's
//                                 addressing mode parameters and cannot come
//                                 in the middle of them.
//   <-                           sets the direction to reverse.
//                                 This is pushed after either target's
//                                 addressing mode parameters and cannot come
//                                 in the middle of them.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 CVTSD2SS instruction. This sequence converts a double precision
//   floating point value from the lower 64 bits of the source into a single
//   precision floating point value and puts it into the lower 32 bits of the
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  CVTSD2SS,     // [RBX][63:0] FP64->FP32 -> XMM0[31:0]
//
//  XMM2  XMM0  CVTSD2SS,        // XMM2[63:0] FP64->FP32 -> XMM0[31:0]
//
//  XMM2 <-  XMM0  CVTSD2SS,     // XMM0[63:0] FP64->FP32 -> XMM2[31:0]
//
//  XMM0  XMM8  CVTSD2SS,        // XMM0[63:0] FP64->FP32 -> XMM8[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvcvtsd2sscomma ( VCVTSD2SS, )
//
// C prototype:
//  void dg_forthvcvtsd2sscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either target's
//                                 addressing mode parameters and cannot come
//                                 in the middle of them.
//   <-                           sets the direction to reverse.
//                                 This is pushed after either target's
//                                 addressing mode parameters and cannot come
//                                 in the middle of them.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VCVTSD2SS instruction. This sequence converts a double precision
//   floating point value from the lower 64 bits of the source into a single
//   precision floating point value and puts it into the lower 32 bits of the
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VCVTSD2SS,     // [RBX][63:0] FP64->FP32 -> XMM0[31:0]
//
//  XMM2  XMM0  VCVTSD2SS,        // XMM2[63:0] FP64->FP32 -> XMM0[31:0]
//
//  XMM2 <-  XMM0  VCVTSD2SS,     // XMM0[63:0] FP64->FP32 -> XMM2[31:0]
//
//  XMM0  XMM8  VCVTSD2SS,        // XMM0[63:0] FP64->FP32 -> XMM8[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcvtsi2sdcomma ( CVTSI2SD, )
//
// C prototype:
//  void dg_forthcvtsi2sdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   R                            specifies a register target
//                                 R is optional
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 CVTSI2SD instruction. This opcode sequence converts a single
//   signed integer value from a source register or memory target
//   into a double floating point value and puts it into the
//   destination xmm register. The source register can be a 32 or 64 bit
//   register. If you do not specify the data size for a memory target,
//   the address mode size is used as the default data size.
//   Reverse is not supported for CVTSI2SD,
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  64BIT RAX [R]  XMM0      CVTSI2SD,   //  [RAX][63:0]  I64->FP64 -> XMM0[63:0]
//
//  ECX  XMM0  CVTSI2SD,                 //  ECX[31:0]    I32->FP64 -> XMM0[63:0]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvcvtsi2sdcomma ( VCVTSI2SD, )
//
// C prototype:
//  void dg_forthvcvtsi2sdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   R                            specifies a register target
//                                 R is optional
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VCVTSI2SD instruction. This opcode sequence converts a single
//   signed integer value from a source register or memory target
//   into a double floating point value and puts it into the
//   destination xmm register. The source register can be a 32 or 64 bit
//   register. If you do not specify the data size for a memory target,
//   the address mode size is used as the default data size.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  64BIT RAX [R]  XMM0  VCVTSI2SD,  //  [RAX][63:0]  I64->FP64 -> XMM0[63:0]
//
//  ECX  XMM0  VCVTSI2SD,            //  ECX[31:0]    I32->FP64 -> XMM0[63:0]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcvtsi2sscomma ( CVTSI2SS, )
//
// C prototype:
//  void dg_forthcvtsi2sscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   R                            specifies a register target
//                                 R is optional
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 CVTSI2SS instruction. This opcode sequence gets a single
//   signed integer value from a source register or memory target and converts
//   it into a single floating point value and puts it into the
//   destination xmm register. The source register can be a 32 or 64 bit
//   register. If you do not specify the data size for a memory target,
//   the address mode size is used as the default data size.
//   Reverse is not supported for CVTSI2SS,
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  64BIT RAX [R]  XMM0      CVTSI2SS,   //  [RAX][63:0]  I64->FP32 -> XMM0[31:0]
//
//  ECX  XMM0  CVTSI2SS,                 //  ECX[31:0]    I32->FP32 -> XMM0[31:0]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvcvtsi2sscomma ( VCVTSI2SS, )
//
// C prototype:
//  void dg_forthvcvtsi2sscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   R                            specifies a register target
//                                 R is optional
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VCVTSI2SS instruction. This opcode sequence gets a single
//   signed integer value from a source register or memory target and converts
//   it into a single floating point value and puts it into the
//   destination xmm register. The source register can be a 32 or 64 bit
//   register. If you do not specify the data size for a memory target,
//   the address mode size is used as the default data size.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  64BIT RAX [R]  XMM0  VCVTSI2SS,  //  [RAX][63:0]  I64->FP32 -> XMM0[31:0]
//
//  ECX  XMM0  VCVTSI2SS,            //  ECX[31:0]    I32->FP32 -> XMM0[31:0]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcvtss2sdcomma ( CVTSS2SD, )
//
// C prototype:
//  void dg_forthcvtss2sdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after the addressing mode
//                                 parameters of either target and can not come
//                                 in the middle of a target's parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after the addressing mode
//                                 parameters of either target and can not come
//                                 in the middle of a target's parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 CVTSS2SD instruction. This sequence converts a single precision
//   floating point value from the lower 32 bits of the source into a double
//   precision floating point value and puts it into the lower 64 bits of the
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  CVTSS2SD,     // [RBX][31:0] FP32->FP64 -> XMM0[63:0]
//
//  XMM2  XMM0  CVTSS2SD,        // XMM2[31:0] FP32->FP64  -> XMM0[63:0]
//
//  XMM2 <-  XMM0  CVTSS2SD,  // XMM0[31:0] FP32->FP64  -> XMM2[63:0]
//
//  XMM0  XMM8  CVTSS2SD,        // XMM0[31:0] FP32->FP64  -> XMM8[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
//
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvcvtss2sdcomma ( VCVTSS2SD, )
//
// C prototype:
//  void dg_forthvcvtss2sdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VCVTSS2SD instruction. This sequence converts a single precision
//   floating point value from the lower 32 bits of the source into a double
//   precision floating point value and puts it into the lower 64 bits of the
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VCVTSS2SD,     // [RBX][31:0] FP32->FP64 -> XMM0[63:0]
//
//  XMM2  XMM0  VCVTSS2SD,        // XMM2[31:0] FP32->FP64  -> XMM0[63:0]
//
//  XMM2 <-  XMM0  VCVTSS2SD,  // XMM0[31:0] FP32->FP64  -> XMM2[63:0]
//
//  XMM0  XMM8  VCVTSS2SD,        // XMM0[31:0] FP32->FP64  -> XMM8[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcvtss2sicomma ( CVTSS2SI, )
//
// C prototype:
//  void dg_forthcvtss2sicomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetregister
//   targetregister R
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   R                            specifies a register target
//                                 R is optional
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 CVTSS2SI instruction. This opcode sequence gets a single
//   precision floating point value from an xmm register or memory source and
//   converts it into a signed integer and puts it into the
//   destination register. If the destination register is 32 bits, the floating
//   point value is converted to a signed 32 bit integer. If the destination
//   register is 64 bits, the floating point value is converted to a signed
//   64 bit integer. Rounding uses the MXCSR register setting.
//   Reverse is not supported for CVTSS2SI,
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]   RCX   CVTSS2SI,      //  [RAX][31:0]  FP32->I64 -> RCX[63:0]
//
//  XMM0  ECX   CVTSS2SI,          //  XMM0[31:0]   FP32->I32 -> RCX[31:0]
//
// Note:
//  Only 32 bit and 64 bit destination registers are supported for this compiling
//   word. The source size is ignored and set to the destination size.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvcvtss2sicomma ( VCVTSS2SI, )
//
// C prototype:
//  void dg_forthvcvtss2sicomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetregister
//   targetregister R
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   R                            specifies a register target
//                                 R is optional
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VCVTSS2SI instruction. This opcode sequence gets a single
//   precision floating point value from an xmm register or memory source and
//   converts it into a signed integer and puts it into the
//   destination register. If the destination register is 32 bits, the floating
//   point value is converted to a signed 32 bit integer. If the destination
//   register is 64 bits, the floating point value is converted to a signed
//   64 bit integer. Rounding uses the MXCSR register setting.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]   RCX   VCVTSS2SI,  //  [RAX][31:0]  FP32->I64 -> RCX[63:0]
//
//  XMM0  ECX   VCVTSS2SI,      //  XMM0[31:0]   FP32->I32 -> RCX[31:0]
//
// Note:
//  Only 32 bit and 64 bit destination registers are supported for this compiling
//   word. The source size is ignored and set to the destination size.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcvttpd2dqcomma ( CVTTPD2DQ, )
//
// C prototype:
//  void dg_forthcvttpd2dqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 CVTTPD2DQ instruction. This sequence converts 2 double precision
//   floating point values from the source into 2 signed doubleword integer values
//   using truncation and puts the results into the lower 64 bits of the
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  CVTTPD2DQ,     // [RBX][63:0] FP64->INT32 -> XMM0[31:0]
//                                // [RBX][127:64] FP64->INT32 -> XMM0[63:32]
//
//  XMM2  XMM0  CVTTPD2DQ,        // XMM2[63:0] FP64->INT32 -> XMM0[31:0]
//                                // XMM2[127:64] FP64->INT32 -> XMM0[63:32]
//
//  XMM2 <-  XMM0  CVTTPD2DQ,  // XMM0[63:0] FP64->INT32 -> XMM2[31:0]
//                                // XMM0[127:64] FP64->INT32 -> XMM2[63:32]
//
//  XMM0  XMM8  CVTTPD2DQ,        // XMM0[63:0] FP64->INT32 -> XMM8[31:0]
//                                // XMM0[127:64] FP64->INT32 -> XMM8[63:32]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvcvttpd2dqcomma ( VCVTTPD2DQ, )
//
// C prototype:
//  void dg_forthvcvttpd2dqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymm register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VCVTTPD2DQ instruction. This sequence converts 2 or 4 double precision
//   floating point values from the source into 2 or 4 signed doubleword integer
//   values using truncation and puts the results into the lower 64 bits (2 values)
//   or the lower 128 bits (4 values) of the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VCVTTPD2DQ,    // [RBX][63:0] FP64->INT32 -> XMM0[31:0]
//                                // [RBX][127:64] FP64->INT32 -> XMM0[63:32]
//
//  XMM2  XMM0  VCVTTPD2DQ,       // XMM2[63:0] FP64->INT32 -> XMM0[31:0]
//                                // XMM2[127:64] FP64->INT32 -> XMM0[63:32]
//
//  XMM2 <-  XMM0  VCVTTPD2DQ, // XMM0[63:0] FP64->INT32 -> XMM2[31:0]
//                                // XMM0[127:64] FP64->INT32 -> XMM2[63:32]
//
//  YMM0  YMM8  VCVTTPD2DQ,   // YMM0[63:0]    FP64->INT32 -> YMM8[31:0]
//                            // YMM0[127:64]  FP64->INT32 -> YMM8[63:32]
//                            // YMM0[191:128] FP64->INT32 -> YMM8[95:64]
//                            // YMM0[255:192] FP64->INT32 -> YMM8[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcvttpd2picomma ( CVTTPD2PI, )
//
// C prototype:
//  void dg_forthcvttpd2picomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 CVTTPD2PI instruction. This opcode sequence converts two double
//   precision floating point values from an xmm register or memory source
//   into two signed 32 bit integers using truncation and puts them into the
//   destination floating point register.
//   Reverse is not supported for CVTTPD2PI,
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  XMM0  CVTTPD2PI,  //  XMM0[63:0]   FP64->I32 -> [RAX][31:0]
//                             //  XMM0[127:64] FP64->I32 -> [RAX][63:32]
//
//  ST1  XMM0  CVTTPD2PI,      //  XMM0[63:0]   FP64->I32 -> ST1[31:0]
//                             //  XMM0[127:64] FP64->I32 -> ST1[63:32]
//
// Note:
//  You do not need to specify a data size for memory target destinations. If you do,
//   it must be 64BIT.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcvttps2dqcomma ( CVTTPS2DQ, )
//
// C prototype:
//  void dg_forthcvttps2dqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after the addressing mode
//                                 parameters of either target and cannot
//                                 come in the middle of a target's
//                                 addressing mode parameters.
//   <-                           sets the direction to reverse.
//                                 This is pushed after the addressing mode
//                                 parameters of either target and cannot
//                                 come in the middle of a target's
//                                 addressing mode parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 CVTTPS2DQ instruction. This sequence converts 4 single precision
//   floating point values from the source into 4 signed doubleword (32 bit)
//   integer values using truncation and puts the results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  CVTTPS2DQ,     // [RBX][31:0] FP32->INT32 -> XMM0[31:0]
//                                // [RBX][63:32] FP32->INT32 -> XMM0[63:32]
//                                // [RBX][95:64] FP32->INT32 -> XMM0[95:64]
//                                // [RBX][127:96] FP32->INT32 -> XMM0[127:96]
//
//  XMM2  XMM0  CVTTPS2DQ,        // XMM2[31:0] FP32->INT32 -> XMM0[31:0]
//                                // XMM2[63:32] FP32->INT32 -> XMM0[63:32]
//                                // XMM2[95:64] FP32->INT32 -> XMM0[95:64]
//                                // XMM2[127:96] FP32->INT32 -> XMM0[127:96]
//
//  XMM2 <-  XMM0  CVTTPS2DQ,     // XMM0[31:0] FP32->INT32 -> XMM2[31:0]
//                                // XMM0[63:32] FP32->INT32 -> XMM2[63:32]
//                                // XMM0[95:64] FP32->INT32 -> XMM2[95:64]
//                                // XMM0[127:96] FP32->INT32 -> XMM2[127:96]
//
//  XMM0  XMM8  CVTTPS2DQ,        // XMM0[31:0] FP32->INT32 -> XMM8[31:0]
//                                // XMM0[63:32] FP32->INT32 -> XMM8[63:32]
//                                // XMM0[95:64] FP32->INT32 -> XMM8[95:64]
//                                // XMM0[127:96] FP32->INT32 -> XMM8[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvcvttps2dqcomma ( VCVTTPS2DQ, )
//
// C prototype:
//  void dg_forthvcvttps2dqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
// 
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after the addressing mode
//                                 parameters of either target and cannot
//                                 come in the middle of a target's
//                                 addressing mode parameters.
//   <-                           sets the direction to reverse.
//                                 This is pushed after the addressing mode
//                                 parameters of either target and cannot
//                                 come in the middle of a target's
//                                 addressing mode parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymm register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VCVTTPS2DQ instruction. This sequence converts 4 or 8 single
//   precision floating point values from the source into 4 or 8 signed
//   doubleword (32 bit) integer values using truncation and puts the results
//   into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VCVTTPS2DQ,  // [RBX][31:0] FP32->INT32 -> XMM0[31:0]
//                              // [RBX][63:32] FP32->INT32 -> XMM0[63:32]
//                              // [RBX][95:64] FP32->INT32 -> XMM0[95:64]
//                              // [RBX][127:96] FP32->INT32 -> XMM0[127:96]
//
//  XMM2  XMM0  VCVTTPS2DQ,     // XMM2[31:0] FP32->INT32 -> XMM0[31:0]
//                              // XMM2[63:32] FP32->INT32 -> XMM0[63:32]
//                              // XMM2[95:64] FP32->INT32 -> XMM0[95:64]
//                              // XMM2[127:96] FP32->INT32 -> XMM0[127:96]
//
//  XMM2 <-  XMM0  VCVTTPS2DQ,    // XMM0[31:0] FP32->INT32 -> XMM2[31:0]
//                                // XMM0[63:32] FP32->INT32 -> XMM2[63:32]
//                                // XMM0[95:64] FP32->INT32 -> XMM2[95:64]
//                                // XMM0[127:96] FP32->INT32 -> XMM2[127:96]
//
//  YMM0  YMM8  VCVTTPS2DQ,   // YMM0[31:0]    FP32->INT32 -> YMM8[31:0]
//                            // YMM0[63:32]   FP32->INT32 -> YMM8[63:32]
//                            // YMM0[95:64]   FP32->INT32 -> YMM8[95:64]
//                            // YMM0[127:96]  FP32->INT32 -> YMM8[127:96]
//                            // YMM0[159:128] FP32->INT32 -> YMM8[159:128]
//                            // YMM0[191:160] FP32->INT32 -> YMM8[191:160]
//                            // YMM0[223:192] FP32->INT32 -> YMM8[223:192]
//                            // YMM0[255:224] FP32->INT32 -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcvttsd2sicomma ( CVTTSD2SI, )
//
// C prototype:
//  void dg_forthcvttsd2sicomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   R                            specifies a register target
//                                 R is optional
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 CVTTSD2SI instruction. This opcode sequence gets a double
//   precision floating point value from an xmm register or memory source and
//   converts it into a signed integer using truncation and puts it into the
//   destination register. If the destination register is 32 bits, the floating
//   point value is converted to a signed 32 bit integer. If the destination
//   register is 64 bits, the floating point value is converted to a signed
//   64 bit integer.
//   Reverse is not supported for CVTTSD2SI,
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  RCX  CVTTSD2SI,  //  [RAX][63:0]  FP64->I64 -> RCX[63:0]
//
//  XMM0  ECX  CVTTSD2SI,     //  XMM0[63:0]   FP64->I32 -> RCX[31:0]
//
// Note:
//  Only 32 bit and 64 bit destination registers are supported for this compiling
//   word. The source size is ignored and is set to the destination size.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvcvttsd2sicomma ( VCVTTSD2SI, )
//
// C prototype:
//  void dg_forthvcvttsd2sicomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   R                            specifies a register target
//                                 R is optional
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VCVTTSD2SI instruction. This opcode sequence gets a double
//   precision floating point value from an xmm register or memory source and
//   converts it into a signed integer using truncation and puts it into the
//   destination register. If the destination register is 32 bits, the floating
//   point value is converted to a signed 32 bit integer. If the destination
//   register is 64 bits, the floating point value is converted to a signed
//   64 bit integer.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  RCX  VCVTTSD2SI, //  [RAX][63:0]  FP64->I64 -> RCX[63:0]
//
//  XMM0  ECX  VCVTTSD2SI,    //  XMM0[63:0]   FP64->I32 -> RCX[31:0]
//
// Note:
//  Only 32 bit and 64 bit destination registers are supported for this 
//   compiling word. The source size is ignored and is set to the destination 
//   size.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcvttss2sicomma ( CVTTSS2SI, )
//
// C prototype:
//  void dg_forthcvttss2sicomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   R                            specifies a register target
//                                 R is optional
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 CVTTSS2SI instruction. This opcode sequence gets a single
//   precision floating point value from an xmm register or memory source and
//   converts it into a signed integer using truncation and puts it into the
//   destination register. If the destination register is 32 bits, the floating
//   point value is converted to a signed 32 bit integer. If the destination
//   register is 64 bits, the floating point value is converted to a signed
//   64 bit integer.
//   Reverse is not supported for CVTTSS2SI,
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  RCX  CVTTSS2SI,  //  [RAX][31:0]  FP32->I64 -> RCX[63:0]
//
//  XMM0  ECX  CVTTSS2SI,     //  XMM0[31:0]   FP32->I32 -> RCX[31:0]
//
// Note:
//  Only 32 bit and 64 bit destination registers are supported for this compiling
//   word. The source size is ignored and set to the destination size.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvcvttss2sicomma ( VCVTTSS2SI, )
//
// C prototype:
//  void dg_forthvcvttss2sicomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   R                            specifies a register target
//                                 R is optional
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VCVTTSS2SI instruction. This opcode sequence gets a single
//   precision floating point value from an xmm register or memory source and
//   converts it into a signed integer using truncation and puts it into the
//   destination register. If the destination register is 32 bits, the floating
//   point value is converted to a signed 32 bit integer. If the destination
//   register is 64 bits, the floating point value is converted to a signed
//   64 bit integer.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  RCX  VCVTTSS2SI, //  [RAX][31:0]  FP32->I64 -> RCX[63:0]
//
//  XMM0  ECX  VCVTTSS2SI,    //  XMM0[31:0]   FP32->I32 -> RCX[31:0]
//
// Note:
//  Only 32 bit and 64 bit destination registers are supported for this 
//   compiling word. The source size is ignored and set to the destination size.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcwdcomma ( CWD, AX->DX:AX, )
//
// C prototype:
//  void dg_forthcwdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode string for the x86 CWD instruction.
//  This instruction sign extends AX into DX:AX
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  AX->DX:AX,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcwdecomma ( CWDE, AX->EAX, )
//
// C prototype:
//  void dg_forthcwdecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode string for the x86 CWDE instruction.
//  This instruction sign extends AX into EAX
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  AX->EAX,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthdaacomma ( DAA, )
//
// C prototype:
//  void dg_forthdaacomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode sequence for an x86 DAA instruction. This is a decimal
//   adjust register AL after addition instruction which is used with binary
//   coded decimal.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  DAA,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthdascomma ( DAS, )
//
// C prototype:
//  void dg_forthdascomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode sequence for an x86 DAS instruction. This is a decimal
//   adjust register AL after subtraction instruction which is used with binary
//   coded decimal.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  DAS,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthdeccomma ( DEC, )
//
// C prototype:
//  void dg_forthdeccomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for the target can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   8BIT
//   16BIT
//   32BIT
//   64BIT
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1, 2, 4, or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//    an x86 DEC instruction.
//    This opcode sequence does:
//      target <- target - 1
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  EAX  DEC,                      // decrements EAX
//  CX  DEC,                       // decrements CX
//  EDX [R]  32BIT DEC,            // decrements the 32 bit value at
//                                 //  the memory location specified by EDX,
//                                 //  size specifier required
//  R9 DEC,                        // decrements R9
//                                 //  64 bit address mode only
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthdivcomma ( DIV, )
//
// C prototype:
//  void dg_forthdivcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   8BIT
//   16BIT
//   32BIT
//   64BIT
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1, 2, 4, or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//    an x86 DIV instruction, which is an unsigned divide.
//    This opcode sequence does different actions based on the DATASIZE of the
//    instruction:
//     if size is 8BIT then AL remainder AH <- AX / target
//     if size is 16BIT then AX remainder DX <- DX:AX / target
//     if size is 32BIT then EAX remainder EDX <- EDX:EAX / target
//     if size is 64BIT then RAX remainder RDX <- RDX:RAX / target
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthdivpdcomma ( DIVPD, )
//
// C prototype:
//  void dg_forthdivpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 DIVPD instruction. This sequence divides each double precision
//   floating point value in the destination by the corresponding double
//   precision floating point value in the source and stores the results
//   to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  DIVPD,     // XMM0[63:0] / [RBX][63:0] -> XMM0[63:0]
//                            // XMM0[127:64] / [RBX][127:64] -> XMM0[127:64]
//
//  XMM2  XMM0  DIVPD,        // XMM0[63:0] / XMM2[63:0] -> XMM0[63:0]
//                            // XMM0[127:64] / XMM2[127:64] -> XMM0[127:64]
//
//  XMM2 <-  XMM0  DIVPD,  // XMM2[63:0] / XMM0[63:0] -> XMM2[63:0]
//                            // XMM2[127:64] / XMM0[127:64] -> XMM2[127:64]
//
//  XMM0  XMM8  DIVPD,        // XMM8[63:0] / XMM0[63:0] -> XMM8[63:0]
//                            // XMM8[127:64] / XMM0[127:64] -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvdivpdcomma ( VDIVPD, )
//
// C prototype:
//  void dg_forthvdivpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
// 
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VDIVPD instruction. This sequence divides each double precision
//   floating point value in target y by the corresponding double
//   precision floating point value in the source and stores the results
//   to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VDIVPD,  // XMM1[63:0]   / [RBX][63:0]   -> XMM0[63:0]
//                                // XMM1[127:64] / [RBX][127:64] -> XMM0[127:64]
//
//  XMM2  XMM1  XMM0  VDIVPD,     // XMM1[63:0]   / XMM2[63:0]   -> XMM0[63:0]
//                                // XMM1[127:64] / XMM2[127:64] -> XMM0[127:64]
//
//  XMM2 <-  XMM1  XMM0  VDIVPD, // XMM1[63:0]   / XMM0[63:0]   -> XMM2[63:0]
//                                  // XMM1[127:64] / XMM0[127:64] -> XMM2[127:64]
//
//  XMM0  XMM1  XMM8  VDIVPD, // XMM1[63:0]   / XMM0[63:0]   -> XMM8[63:0]
//                            // XMM1[127:64] / XMM0[127:64] -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthdivpscomma ( DIVPS, )
//
// C prototype:
//  void dg_forthdivpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 DIVPS instruction. This sequence divides each single precision
//   floating point value in the destination by the corresponding single
//   precision floating point value in the source and stores the results
//   to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  DIVPS,     // XMM0[31:0] / [RBX][31:0] -> XMM0[31:0]
//                            // XMM0[63:32] / [RBX][63:32] -> XMM0[63:32]
//                            // XMM0[95:64] / [RBX][95:64] -> XMM0[95:64]
//                            // XMM0[127:96] / [RBX][127:96] -> XMM0[127:96]
//
//  XMM2  XMM0  DIVPS,        // XMM0[31:0] / XMM2[31:0] -> XMM0[31:0]
//                            // XMM0[63:32] / XMM2[63:32] -> XMM0[63:32]
//                            // XMM0[95:64] / XMM2[95:64] -> XMM0[95:64]
//                            // XMM0[127:96] / XMM2[127:96] -> XMM0[127:96]
//
//  XMM2 <-  XMM0  DIVPS,  // XMM2[31:0] / XMM0[31:0] -> XMM2[31:0]
//                            // XMM2[63:32] / XMM0[63:32] -> XMM2[63:32]
//                            // XMM2[95:64] / XMM0[95:64] -> XMM2[95:64]
//                            // XMM2[127:96] / XMM0[127:96] -> XMM2[127:96]
//
//  XMM0  XMM8  DIVPS,        // XMM8[31:0] / XMM0[31:0] -> XMM8[31:0]
//                            // XMM8[63:32] / XMM0[63:32] -> XMM8[63:32]
//                            // XMM8[95:64] / XMM0[95:64] -> XMM8[95:64]
//                            // XMM8[127:96] / XMM0[127:96] -> XMM8[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvdivpscomma ( VDIVPS, )
//
// C prototype:
//  void dg_forthvdivpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VDIVPS instruction. This sequence divides each single precision
//   floating point value in target y by the corresponding single
//   precision floating point value in the source and stores the results
//   to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VDIVPS, // XMM1[31:0]   / [RBX][31:0]   -> XMM0[31:0]
//                               // XMM1[63:32]  / [RBX][63:32]  -> XMM0[63:32]
//                               // XMM1[95:64]  / [RBX][95:64]  -> XMM0[95:64]
//                               // XMM1[127:96] / [RBX][127:96] -> XMM0[127:96]
//
//  XMM2  XMM1  XMM0  VDIVPS, // XMM1[31:0]   / XMM2[31:0]   -> XMM0[31:0]
//                            // XMM1[63:32]  / XMM2[63:32]  -> XMM0[63:32]
//                            // XMM1[95:64]  / XMM2[95:64]  -> XMM0[95:64]
//                            // XMM1[127:96] / XMM2[127:96] -> XMM0[127:96]
//
//  XMM2 <-  XMM1  XMM0  VDIVPS, // XMM1[31:0] / XMM0[31:0] -> XMM2[31:0]
//                            // XMM1[63:32]  / XMM0[63:32]  -> XMM2[63:32]
//                            // XMM1[95:64]  / XMM0[95:64]  -> XMM2[95:64]
//                            // XMM1[127:96] / XMM0[127:96] -> XMM2[127:96]
//
//  XMM0  XMM1  XMM8  VDIVPS, // XMM1[31:0]   / XMM0[31:0]   -> XMM8[31:0]
//                            // XMM1[63:32]  / XMM0[63:32]  -> XMM8[63:32]
//                            // XMM1[95:64]  / XMM0[95:64]  -> XMM8[95:64]
//                            // XMM1[127:96] / XMM0[127:96] -> XMM8[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthdivsdcomma ( DIVSD, )
//
// C prototype:
//  void dg_forthdivsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 DIVSD instruction. This sequence divides the double precision
//   floating point value in the lower 64 bits of the destination by the double
//   precision floating point value in the lower 64 bits of the source and
//   stores the result to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  DIVSD,     // XMM0[63:0] / [RBX][63:0] -> XMM0[63:0]
//
//  XMM2  XMM0  DIVSD,        // XMM0[63:0] / XMM2[63:0] -> XMM0[63:0]
//
//  XMM2 <-  XMM0  DIVSD,  // XMM2[63:0] / XMM0[63:0] -> XMM2[63:0]
//
//  XMM0  XMM8  DIVSD,        // XMM8[63:0] / XMM0[63:0] -> XMM8[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvdivsdcomma ( VDIVSD, )
//
// C prototype:
//  void dg_forthvdivsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VDIVSD instruction. This sequence divides the double precision
//   floating point value in the lower 64 bits of target y by the double
//   precision floating point value in the lower 64 bits of the source and
//   stores the result to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VDIVSD,    // XMM1[63:0] / [RBX][63:0] -> XMM0[63:0]
//
//  XMM2  XMM1  XMM0  VDIVSD,       // XMM1[63:0] / XMM2[63:0] -> XMM0[63:0]
//
//  XMM2 <-  XMM1  XMM0  VDIVSD, // XMM1[63:0] / XMM0[63:0] -> XMM2[63:0]
//
//  XMM0  XMM1  XMM8  VDIVSD,       // XMM1[63:0] / XMM0[63:0] -> XMM8[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthdivsscomma ( DIVSS, )
//
// C prototype:
//  void dg_forthdivsscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 DIVSS instruction. This sequence divides the single precision
//   floating point value in the lower 32 bits of the destination by the single
//   precision floating point value in the lower 32 bits of the source and
//   stores the result to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  DIVSS,     // XMM0[31:0] / [RBX][31:0] -> XMM0[31:0]
//
//  XMM2  XMM0  DIVSS,        // XMM0[31:0] / XMM2[31:0] -> XMM0[31:0]
//
//  XMM2 <-  XMM0  DIVSS,  // XMM2[31:0] / XMM0[31:0] -> XMM2[31:0]
//
//  XMM0  XMM8  DIVSS,        // XMM8[31:0] / XMM0[31:0] -> XMM8[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvdivsscomma ( VDIVSS, )
//
// C prototype:
//  void dg_forthvdivsscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VDIVSS instruction. This sequence divides the single precision
//   floating point value in the lower 32 bits of target y by the single
//   precision floating point value in the lower 32 bits of the source and
//   stores the result to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VDIVSS,    // XMM1[31:0] / [RBX][31:0] -> XMM0[31:0]
//
//  XMM2  XMM1  XMM0  VDIVSS,       // XMM1[31:0] / XMM2[31:0] -> XMM0[31:0]
//
//  XMM2 <-  XMM1  XMM0  VDIVSS, // XMM1[31:0] / XMM0[31:0] -> XMM2[31:0]
//
//  XMM0  XMM1  XMM8  VDIVSS,       // XMM1[31:0] / XMM0[31:0] -> XMM8[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthdppdcomma ( DPPD, )
//
// C prototype:
//  void dg_forthdppdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targets x and y can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 DPPD instruction. This opcode sequence does this:
//   destination[63:0] =
//    immediate[0] * ( (immediate[4]*(destination[63:0]   * source[63:0])) +
//                     (immediate[5]*(destination[127:64] * source[127:64])) )
//   destination[127:64] =
//    immediate[1] * ( (immediate[4]*(destination[63:0]   * source[63:0])) +
//                     (immediate[5]*(destination[127:64] * source[127:64])) )
//
//   What this means is, if the bit in the immediate target is 0, it clears
//    part of the calculation. If the bit is 1, it lets the result through.
//   If bits 0, 1, 4, and 5 are set, each double precision floating point value
//    in the destination is multiplied by the corresponding double precision
//    floating point value in the source, then the two results are added
//    together and a copy is put into both the high and low 64 bits of the
//    destination.
//   Bits 0 and 1 select which half of the destination gets a copy of the
//    final result. Bits 4 and 5 select which multiplication gets added
//    into the final result.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM0  DPPD,   // 0 -> XMM0[63:0]
//                               // 0 -> XMM0[127:64]
//
//  11 N  RBX [R]  XMM0  DPPD,   // [RBX][63:0] * XMM0[63:0] -> XMM0[63:0]
//                               // 0 -> XMM0[127:64]
//
//  22 N  RBX [R]  XMM0  DPPD,   // 0 -> XMM0[63:0]
//                               // [RBX][127:64] * XMM0[127:64] -> XMM0[127:64]
//
//  33 N  RBX [R]  XMM0  DPPD,   // ( ([RBX][63:0] * XMM0[63:0]) +
//                               //   ([RBX][127:64] * XMM0[127:64]) )
//                               //   -> XMM0[63:0]
//                               // ( ([RBX][63:0] * XMM0[63:0]) +
//                               //   ([RBX][127:64] * XMM0[127:64]) )
//                               //   -> XMM0[127:64]
//
//  22 N  XMM2  XMM0  DPPD,      // 0 -> XMM0[63:0]
//                               // XMM2[127:64] * XMM0[127:64] -> XMM0[127:64]
//
//  22 N  XMM2 <- XMM0  DPPD, // 0 -> XMM2[63:0]
//                               // XMM2[127:64] * XMM0[127:64] -> XMM2[127:64]
//
//  22 N  XMM0  XMM8 DPPD,       // 0 -> XMM8[63:0]
//                               // XMM8[127:64] * XMM0[127:64] -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvdppdcomma ( VDPPD, )
//
// C prototype:
//  void dg_forthvdppdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist 
//     targetzparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for target y can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targets x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targets x and z can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls four targets from the data stack and compiles the opcode sequence for
//   an x86 VDPPD instruction. This opcode sequence does this:
//   destination[63:0] =
//    immediate[0] * ( (immediate[4]*(targety[63:0]   * source[63:0])) +
//                     (immediate[5]*(targety[127:64] * source[127:64])) )
//   destination[127:64] =
//    immediate[1] * ( (immediate[4]*(targety[63:0]   * source[63:0])) +
//                     (immediate[5]*(targety[127:64] * source[127:64])) )
//
//   What this means is, if the bit in the immediate target is 0, it clears
//    part of the calculation. If the bit is 1, it lets the result through.
//   If bits 0, 1, 4, and 5 are set, each double precision floating point value
//    in target y is multiplied by the corresponding double precision
//    floating point value in the source, then the two results are added
//    together and a copy is put into both the high and low 64 bits of the
//    destination.
//   Bits 0 and 1 select which half of the destination gets a copy of the
//    final result. Bits 4 and 5 select which multiplication gets added
//    into the final result.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM1  XMM0  VDPPD, // 0 -> XMM0[63:0]
//                                    // 0 -> XMM0[127:64]
//
//  11 N  RBX [R]  XMM1  XMM0  VDPPD, // [RBX][63:0] * XMM1[63:0] -> XMM0[63:0]
//                                    // 0 -> XMM0[127:64]
//
//  22 N  RBX [R]  XMM1  XMM0  VDPPD, // 0 -> XMM0[63:0]
//                               // [RBX][127:64] * XMM1[127:64] -> XMM0[127:64]
//
//  33 N  RBX [R]  XMM1  XMM0  VDPPD, // ( ([RBX][63:0] * XMM1[63:0]) +
//                                    //   ([RBX][127:64] * XMM1[127:64]) )
//                                    //   -> XMM0[63:0]
//                                    // ( ([RBX][63:0] * XMM1[63:0]) +
//                                    //   ([RBX][127:64] * XMM1[127:64]) )
//                                    //   -> XMM0[127:64]
//
//  22 N  XMM2  XMM1  XMM0  VDPPD,    // 0 -> XMM0[63:0]
//                               // XMM2[127:64] * XMM1[127:64] -> XMM0[127:64]
//
//  22 N  XMM2 <- XMM1  XMM0  VDPPD, // 0 -> XMM2[63:0]
//                               // XMM0[127:64] * XMM1[127:64] -> XMM2[127:64]
//
//  22 N  XMM0  XMM1  XMM8 VDPPD,       // 0 -> XMM8[63:0]
//                               // XMM0[127:64] * XMM1[127:64] -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthdppscomma ( DPPS, )
//
// C prototype:
//  void dg_forthdppscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for target y can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for these targets x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targets x and z can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 DPPS instruction. This opcode sequence does this:
//   destination[31:0] =
//    immediate[0] * ( (immediate[4]*(destination[31:0]   * source[31:0])) +
//                     (immediate[5]*(destination[63:32]  * source[63:32])) +
//                     (immediate[6]*(destination[95:64]  * source[95:64])) +
//                     (immediate[7]*(destination[127:96] * source[127:96])) )
//   destination[63:32] =
//    immediate[1] * ( (immediate[4]*(destination[31:0]   * source[31:0])) +
//                     (immediate[5]*(destination[63:32]  * source[63:32])) +
//                     (immediate[6]*(destination[95:64]  * source[95:64])) +
//                     (immediate[7]*(destination[127:96] * source[127:96])) )
//   destination[95:64] =
//    immediate[2] * ( (immediate[4]*(destination[31:0]   * source[31:0])) +
//                     (immediate[5]*(destination[63:32]  * source[63:32])) +
//                     (immediate[6]*(destination[95:64]  * source[95:64])) +
//                     (immediate[7]*(destination[127:96] * source[127:96])) )
//   destination[127:96] =
//    immediate[3] * ( (immediate[4]*(destination[31:0]   * source[31:0])) +
//                     (immediate[5]*(destination[63:32]  * source[63:32])) +
//                     (immediate[6]*(destination[95:64]  * source[95:64])) +
//                     (immediate[7]*(destination[127:96] * source[127:96])) )
//
//   What this means is, if the bit in the immediate target is 0, it clears
//    part of the calculation. If the bit is 1, it lets the result through.
//   If bits 0, 1, 2, 3, 4, 5, 6, and 7 are set, each single precision floating
//    point value in the destination is multiplied by the corresponding single
//    precision floating point value in the source, then the four results are
//    added together and a copy is put into all four 32 bit values of
//    the destination.
//   Bits 0, 1, 2, and 3 select which quarter of the destination gets a copy
//    of the final result. Bits 4, 5, 6, and 7 select which multiplication
//    gets added into the final result.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM0  DPPS,   // 0 -> XMM0[31:0]
//                               // 0 -> XMM0[63:32]
//                               // 0 -> XMM0[95:64]
//                               // 0 -> XMM0[127:96]
//
//  11 N  RBX [R]  XMM0  DPPS,   // [RBX][31:0] * XMM0[31:0] -> XMM0[31:0]
//                               // 0 -> XMM0[63:32]
//                               // 0 -> XMM0[95:64]
//                               // 0 -> XMM0[127:96]
//
//  22 N  RBX [R]  XMM0  DPPS,   // 0 -> XMM0[31:0]
//                               // [RBX][63:32] * XMM0[63:32] -> XMM0[63:32]
//                               // 0 -> XMM0[95:64]
//                               // 0 -> XMM0[127:96]
//
//  33 N  RBX [R]  XMM0  DPPS,   // ( ([RBX][31:0] * XMM0[31:0]) +
//                               //   ([RBX][63:32] * XMM0[63:32]) )
//                               //   -> XMM0[31:0]
//                               // ( ([RBX][31:0] * XMM0[31:0]) +
//                               //   ([RBX][63:32] * XMM0[63:32]) )
//                               //   -> XMM0[63:32]
//
//  22 N  XMM2  XMM0  DPPS,      // 0 -> XMM0[31:0]
//                               // XMM2[63:32] * XMM0[63:32] -> XMM0[63:32]
//                               // 0 -> XMM0[95:64]
//                               // 0 -> XMM0[127:96]
//
//  22 N  XMM0 <- XMM2  DPPS, // 0 -> XMM0[31:0]
//                               // XMM2[63:32] * XMM0[63:32] -> XMM0[63:32]
//                               // 0 -> XMM0[95:64]
//                               // 0 -> XMM0[127:96]
//
//  22 N  XMM8  XMM0 DPPS,       // 0 -> XMM0[31:0]
//                               // XMM8[63:32] * XMM0[63:32] -> XMM0[63:32]
//                               // 0 -> XMM0[95:64]
//                               // 0 -> XMM0[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvdppscomma ( VDPPS, )
//
// C prototype:
//  void dg_forthvdppscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist 
//     targetzparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for target y can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targets x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targets x and z can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls four targets from the data stack and compiles the opcode sequence for
//   an x86 VDPPS instruction. This opcode sequence does this:
//   destination[31:0] =
//    immediate[0] * ( (immediate[4]*(targety[31:0]   * source[31:0])) +
//                     (immediate[5]*(targety[63:32]  * source[63:32])) +
//                     (immediate[6]*(targety[95:64]  * source[95:64])) +
//                     (immediate[7]*(targety[127:96] * source[127:96])) )
//   destination[63:32] =
//    immediate[1] * ( (immediate[4]*(targety[31:0]   * source[31:0])) +
//                     (immediate[5]*(targety[63:32]  * source[63:32])) +
//                     (immediate[6]*(targety[95:64]  * source[95:64])) +
//                     (immediate[7]*(targety[127:96] * source[127:96])) )
//   destination[95:64] =
//    immediate[2] * ( (immediate[4]*(targety[31:0]   * source[31:0])) +
//                     (immediate[5]*(targety[63:32]  * source[63:32])) +
//                     (immediate[6]*(targety[95:64]  * source[95:64])) +
//                     (immediate[7]*(targety[127:96] * source[127:96])) )
//   destination[127:96] =
//    immediate[3] * ( (immediate[4]*(targety[31:0]   * source[31:0])) +
//                     (immediate[5]*(targety[63:32]  * source[63:32])) +
//                     (immediate[6]*(targety[95:64]  * source[95:64])) +
//                     (immediate[7]*(targety[127:96] * source[127:96])) )
//
//   What this means is, if the bit in the immediate target is 0, it clears
//    part of the calculation. If the bit is 1, it lets the result through.
//   If bits 0, 1, 2, 3, 4, 5, 6, and 7 are set, each single precision floating
//    point value in target y is multiplied by the corresponding single
//    precision floating point value in the source, then the four results are
//    added together and a copy is put into all four 32 bit values of
//    the destination.
//   Bits 0, 1, 2, and 3 select which quarter of the destination gets a copy
//    of the final result. Bits 4, 5, 6, and 7 select which multiplication
//    gets added into the final result.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM1  XMM0  VDPPS, // 0 -> XMM0[31:0]
//                                    // 0 -> XMM0[63:32]
//                                    // 0 -> XMM0[95:64]
//                                    // 0 -> XMM0[127:96]
//
//  11 N  RBX [R]  XMM1  XMM0  VDPPS, // [RBX][31:0] * XMM1[31:0] -> XMM0[31:0]
//                               // 0 -> XMM0[63:32]
//                               // 0 -> XMM0[95:64]
//                               // 0 -> XMM0[127:96]
//
//  22 N  RBX [R]  XMM1  XMM0  VDPPS, // 0 -> XMM0[31:0]
//                               // [RBX][63:32] * XMM1[63:32] -> XMM0[63:32]
//                               // 0 -> XMM0[95:64]
//                               // 0 -> XMM0[127:96]
//
//  33 N  RBX [R]  XMM1  XMM0  VDPPS, // ( ([RBX][31:0] * XMM1[31:0]) +
//                                    //   ([RBX][63:32] * XMM1[63:32]) )
//                                    //   -> XMM0[31:0]
//                                    // ( ([RBX][31:0] * XMM1[31:0]) +
//                                    //   ([RBX][63:32] * XMM1[63:32]) )
//                                    //   -> XMM0[63:32]
//
//  22 N  XMM2  XMM1  XMM0  VDPPS,  // 0 -> XMM0[31:0]
//                               // XMM2[63:32] * XMM1[63:32] -> XMM0[63:32]
//                               // 0 -> XMM0[95:64]
//                               // 0 -> XMM0[127:96]
//
//  22 N  XMM0 <- XMM1  XMM2  VDPPS, // 0 -> XMM0[31:0]
//                               // XMM2[63:32] * XMM1[63:32] -> XMM0[63:32]
//                               // 0 -> XMM0[95:64]
//                               // 0 -> XMM0[127:96]
//
//  22 N  XMM8  XMM1  XMM0 VDPPS,  // 0 -> XMM0[31:0]
//                               // XMM8[63:32] * XMM1[63:32] -> XMM0[63:32]
//                               // 0 -> XMM0[95:64]
//                               // 0 -> XMM0[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthemmscomma ( EMMS, )
//
// C prototype:
//  void dg_forthemmscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode string for the x86 EMMS instruction. This marks the
//   floating point registers as available for floating point instruction use.
//   When an MMX instruction uses a floating point register, the register is
//   marked as not available for floating point use; EMMS undoes that marking.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  EMMS,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthentercomma ( ENTER, )
//
// C prototype:
//  void dg_forthentercomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( immediatetargetxparameterlist immediatetargetyparameterlist -- )
//
//  immediatetargetxparameterlist can be one of:
//  immediatetargetyparameterlists can be one of:
//   ( immediatevalue N -- )
//   ( immediatevalue  minimumimmediatesize  IMMEDIATE -- )
//
//   immediatevalue                 Value of an immediate target.
//   minimumimmediatesize           minimum size of immediate target and is
//                                   ignored for ENTER,
//   IMMEDIATE                      specifies an immediate target. The value for
//                                   this target is encoded into the opcode
//                                   sequence.
//   N                              specifies an immediate target with a minimum
//                                   value size of 0
//
//   immediatetargetxparameterlist  Nesting level 0 - 31
//                                   0 means RBP PUSH, RSP RBP MOV,
//   immediatetargetyparameterlists Number of bytes to subtract from RSP to
//                                   allocate local memory on the return stack
//                                   after doing the nesting level stuff.
//                                   This number needs to be a multiple of the
//                                   current address mode size. In 64 bit mode
//                                   this needs to be a multiple of 8.
//                          
// Execute state action:
//  Pulls both immediate targets from data stack then compiles the opcode
//   string for the x86 ENTER nnnn, nn instruction where
//   nn is the lower 8 bits of the immediate x target and
//   nnnn is the lower 16 bits of the immediate y target.
//   See processor docs for details. 
//   If the x target value is 0 N then this is equivalent to:
//    RBP PUSH,
//    RSP  RBP  MOV,
//    y N  RSP SUB,
//    
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthenterrbpframecomma ( ENTER-RBP-FRAME, ENTER-FRAME, )
//
// C prototype:
//  void dg_forthenterrbpframecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//                                                          
// Action:
//  Compiles code to set up an RBP stack frame.
//  The compiled code is this:
//   RBP PUSH,
//   RSP RBP MOV,
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthextractpscomma ( EXTRACTPS, )
//
// C prototype:
//  void dg_forthextractpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( immediatetargetparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  immediatetargetparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for the immediate target can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//
//   R                            specifies a register target
//                                 R is optional
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 EXTRACTPS instruction. This opcode sequence gets one of four
//   32 bit values from the source xmm register and puts it into
//   the destination register or memory target. Intel docs say this
//   opcode sequence operates on single precision floating point values but
//   any 32 bit values will work.
//   The lower two bits of the immediate value determine which of the four
//   32 bit values are moved. An immediate value of 0 moves the lowest of the
//   four 32 bit values.
//   The data size for this instruction is always 32 bits. If you specify a
//   data size for a memory target it must be 32BIT. The register target can
//   be a 32 or 64 bit register, but will treat either choice like a 32 bit
//   register. Since writing to a 32 bit register in 64 bit
//   address mode clears the upper 32 bits of the corresponding 64 bit register,
//   those upper 32 bits should be cleared if you are in 64 bit address mode.
//   Reverse is not supported for EXTRACTPS,
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  0 N  XMM0  RAX R    EXTRACTPS,  //  XMM0[31:0]   ->  EAX[31:0]
//                                  //   (in 64 bit mode the upper 32 bits of RAX
//                                  //    are cleared, but you knew that ;-)
//
//  1 N  XMM0  EAX R    EXTRACTPS,  //  XMM0[63:32]  ->  EAX[31:0]
//                                  //   (in 64 bit mode the upper 32 bits of RAX
//                                  //    are cleared, but you knew that ;-)
//
//  2 N  XMM0  RAX R    EXTRACTPS,  //  XMM0[95:64]  ->  EAX[31:0]
//                                  //   (in 64 bit mode the upper 32 bits of RAX
//                                  //    are cleared, but you knew that ;-)
//
//  3 N  XMM0  RAX R    EXTRACTPS,  //  XMM0[127:96] ->  EAX[31:0]
//                                  //   (in 64 bit mode the upper 32 bits of RAX
//                                  //    are cleared, but you knew that ;-)
//
//
//  2 N  XMM0  ECX  EXTRACTPS,      //  XMM0[95:64]  ->  ECX[31:0]
//                                  //   (in 64 bit mode the upper 32 bits of RCX
//                                  //    are cleared, but you knew that ;-)
//
// Note:
//  Register size is ignored, the compiler will use the 32 bit equivalent
//   register unless you specify a 64 bit register using R. If you use R
//   with a 64 bit register you will get the rex.w prefix. If it works,
//   the behavior of the opcode sequence should be the same as without rex.w;
//   the upper 32 bits are cleared in either case.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvextractpscomma ( VEXTRACTPS, )
//
// C prototype:
//  void dg_forthvextractpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( immediatetargetparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  immediatetargetparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for the immediate target can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//
//   R                            specifies a register target
//                                 R is optional
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VEXTRACTPS instruction. This opcode sequence gets one of four
//   32 bit values from the source xmm register and puts it into
//   the destination register or memory target. Intel docs say this
//   opcode sequence operates on single precision floating point values but
//   any 32 bit values will work.
//   The lower two bits of the immediate value determine which of the four
//   32 bit values are moved. An immediate value of 0 moves the lowest of the
//   four 32 bit values.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  0 N  XMM0  RAX R    VEXTRACTPS,  //  XMM0[31:0]   ->  EAX[31:0]
//                                   //  0            ->  RAX[63:32]
//
//  1 N  XMM0  EAX R    VEXTRACTPS,  //  XMM0[63:32]  ->  EAX[31:0]
//                                   //  0            ->  RAX[63:32]
//
//  2 N  XMM0  RAX R    VEXTRACTPS,  //  XMM0[95:64]  ->  EAX[31:0]
//                                   //  0            ->  RAX[63:32]
//
//  3 N  XMM0  RAX R    VEXTRACTPS,  //  XMM0[127:96] ->  EAX[31:0]
//                                   //  0            ->  RAX[63:32]
//
//
//  2 N  XMM0  ECX  VEXTRACTPS,      //  XMM0[95:64]  ->  ECX[31:0]
//                                   //  0            ->  RCX[63:32]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthf2xm1comma ( F2XM1, )
//
// C prototype:
//  void dg_forthf2xm1comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that replaces the value in ST0 with (2 ^ (ST0)) - 1.
//   ST0 in must be between -1 and 1.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfabscomma ( FABS, )
//
// C prototype:
//  void dg_forthfabscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that replaces the value in ST0 with its absolute value.
//   This means the sign bit in ST0 is cleared.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfaddtost0comma ( FADD, FADD->ST0, )
//
// C prototype:
//  void dg_forthfaddtost0comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   floatingpointregister
//   floatingpointregister FPSR
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   FPSR                         specifies a floating point stack register
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that adds the floating point value in the target to ST0,
//   then stores the result in ST0.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfaddst0tocomma ( FADDST0->, )
//
// C prototype:
//  void dg_forthfaddst0tocomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   floatingpointregister
//   floatingpointregister FPSR
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   FPSR                         specifies a floating point stack register
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that adds ST0 to the value in the target floating point
//   register, then stores the result to the target floating point register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfaddpst0tocomma ( FADDPST0->, )
//
// C prototype:
//  void dg_forthfaddpst0tocomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   floatingpointregister
//   floatingpointregister FPSR
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   FPSR                         specifies a floating point stack register
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that adds ST0 to the value in the target floating point
//   register, then stores the result to the target floating point register.
//   Then the top value on the floating point stack is dropped.
//   If ST0 was empty before the compiled code executes, then the #IS exception
//    is generated.
//   When a floating point stack underflow occurs, the C1 condition code flag
//    in the x87 status word is set to 0.
//   I wonder what happens if the source floating point register is empty?
//   Is C1 only set when the #IS exception is masked?
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfiaddtost0comma ( FIADD, FIADD->ST0, )
//
// C prototype:
//  void dg_forthfiaddtost0comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array
//                                 where the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   16BIT
//   32BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   FPSR                         specifies a floating point stack register
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that adds the integer value in the target to ST0,
//   then stores the result in ST0.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfbldcomma ( FBLD, )
//
// C prototype:
//  void dg_forthfbldcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array
//                                 where the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( memorytargetparameterlist -- )
//
// Data stack in:
//
//  memorytargetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that converts a 10 byte bcd encoded signed integer
//   from a memory target to floating point and pushes it onto the hardware
//   floating point stack. The bcd value is stored in memory little endian,
//   with 18 bcd digits in 9 bytes followed by a byte to hold the
//   sign bit. The sign bit is stored in bit 7 of the sign byte.
//  Data size and direction specifiers are ignored for this instruction.
//  After the compiled code is executed, ST0 holds a copy of the value if
//   the value was a valid bcd integer.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfbstpcomma ( FBSTP, )
//
// C prototype:
//  void dg_forthfbstpcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array
//                                 where the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( memorytargetparameterlist -- )
//
// Data stack in:
//
//  memorytargetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that pops a floating point value off the hardware floating point
//   stack, converts it to a 10 byte bcd encoded signed integer, and stores it
//   to a memory target. The bcd value is stored in memory little endian,
//   with 18 bcd digits in 9 bytes followed by a byte to hold the
//   sign bit. The sign bit is stored in bit 7 of the sign byte.
//  Data size and direction specifiers are ignored for this instruction.
//  Before the compiled code is executed, ST0 holds the value to be converted
//   to a bcd integer.
//  According to the processor docs, if the floating point value is not an integer,
//   it is rounded to the nearest integer according to the current rounding mode.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfchscomma ( FCHS, )
//
// C prototype:
//  void dg_forthfchscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array
//                                 where the other bufferhandles are stored.
//
// Execute state action:
//  The sign bit in ST0 is toggled. This is the same as multiplying the value
//   in ST0 by -1.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfclexcomma ( FCLEX, )
//
// C prototype:
//  void dg_forthfclexcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array
//                                 where the other bufferhandles are stored.
//
// Execute state action:
//  Checks for and handles unmasked floating point exceptions ( WAIT, )
//   then clears the floating point exception flags.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfnclexcomma ( FNCLEX, )
//
// C prototype:
//  void dg_forthfnclexcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array
//                                 where the other bufferhandles are stored.
//
// Execute state action:
//  Clears the floating point exception flags.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfcmovbcomma ( FCMOVB, FCMOVBST0->, )
//
// C prototype:
//  void dg_forthfcmovbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array
//                                 where the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( floatingpointregister -- )
//
// Data stack in:
//
//  floatingpointregister         one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that will move a floating point value from ST0 to the target
//   floating point register if the eflags carry flag is set.
//  When using the FCOMI, FCOMIP, FUCOMI, or FUCOMIP instructions, the carry flag
//   is set if the destination is below the source.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfcmovbecomma ( FCMOVBE, FCMOVBEST0->, )
//
// C prototype:
//  void dg_forthfcmovbecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array
//                                 where the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( floatingpointregister -- )
//
// Data stack in:
//
//  floatingpointregister         one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that will move a floating point value from ST0 to the target
//   floating point register if the eflags carry and/or zero flag is set.
//  When using the FCOMI, FCOMIP, FUCOMI, or FUCOMIP instructions, the carry flag
//   is set if the destination is below the source and the zero flag is set if
//   the source and destination are equal.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfcmovecomma ( FCMOVE, FCMOVEST0->, )
//
// C prototype:
//  void dg_forthfcmovecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array
//                                 where the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( floatingpointregister -- )
//
// Data stack in:
//
//  floatingpointregister         one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that will move a floating point value from ST0 to the target
//   floating point register if the eflags zero flag is set.
//  When using the FCOMI, FCOMIP, FUCOMI, or FUCOMIP instructions,
//   the zero flag is set if the source and destination are equal.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfcmovucomma ( FCMOVU, FCMOVUST0->, )
//
// C prototype:
//  void dg_forthfcmovucomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array
//                                 where the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( floatingpointregister -- )
//
// Data stack in:
//
//  floatingpointregister         one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that will move a floating point value from ST0 to the target
//   floating point register if the eflags parity flag is set.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfcmovnbcomma ( FCMOVNB, FCMOVNBST0->, )
//
// C prototype:
//  void dg_forthfcmovnbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array
//                                 where the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( floatingpointregister -- )
//
// Data stack in:
//
//  floatingpointregister         one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that will move a floating point value from ST0 to the target
//   floating point register if the eflags carry flag is clear.
//  When using the FCOMI, FCOMIP, FUCOMI, or FUCOMIP instructions, the carry flag
//   is clear if the destination is above or equal to the source.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfcmovnbecomma ( FCMOVNBE, FCMOVNBEST0->, )
//
// C prototype:
//  void dg_forthfcmovnbecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array
//                                 where the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( floatingpointregister -- )
//
// Data stack in:
//
//  floatingpointregister         one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that will move a floating point value from ST0 to the target
//   floating point register if the eflags carry and zero flags are clear.
//  When using the FCOMI, FCOMIP, FUCOMI, or FUCOMIP instructions, the carry and
//   zero flags are clear if the destination is above the source.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfcmovnecomma ( FCMOVNE, FCMOVNEST0->, )
//
// C prototype:
//  void dg_forthfcmovnecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array
//                                 where the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( floatingpointregister -- )
//
// Data stack in:
//
//  floatingpointregister         one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that will move a floating point value from ST0 to the target
//   floating point register if the eflags zero flag is clear.
//  When using the FCOMI, FCOMIP, FUCOMI, or FUCOMIP instructions,
//   the zero flag is clear if the source and destination are not equal.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfcmovnucomma ( FCMOVNU, FCMOVNUST0->, )
//
// C prototype:
//  void dg_forthfcmovnucomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array
//                                 where the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( floatingpointregister -- )
//
// Data stack in:
//
//  floatingpointregister         one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that will move a floating point value from ST0 to the target
//   floating point register if the eflags parity flag is clear.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfcomtost0comma ( FCOM, FCOM->ST0, )
//
// C prototype:
//  void dg_forthfcomtost0comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array
//                                 where the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   floatingpointregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that compares ST0 with the value in the target. This is the
//   same as subtracting the target's value from ST0 but not storing the
//   result anywhere.
//  Status flags in the floating point status register are changed according
//   to the result of the compare.
//   C0 is set if ST0 < target's value, cleared otherwise.
//   C3 is set if ST0 = target's value, cleared otherwise.
//   C2 is set if the compare could not be done and unordered exceptions are
//    masked. (Docs call trying to compare values that are not numbers unordered.)
//    This can happen if one or both of the values is not a number, or is in
//    an unsupported format. When C2 is set, C0 and C3 are also set.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfcomptost0comma ( FCOMP, FCOMP->ST0, )
//
// C prototype:
//  void dg_forthfcomptost0comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   floatingpointregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that compares ST0 with the value in the target. This is the
//   same as subtracting the target's value from ST0 but not storing the
//   result anywhere. Then the top value is dropped from the floating point
//   stack. (Dropped means the top value is popped but not stored.)
//  Status flags in the floating point status register are changed according
//   to the result of the compare.
//   C0 is set if ST0 < target's value, cleared otherwise.
//   C3 is set if ST0 = target's value, cleared otherwise.
//   C2 is set if the compare could not be done and unordered exceptions are
//    masked. (Docs call trying to compare values that are not numbers unordered.)
//    This can happen if one or both of the values is not a number, or is in
//    an unsupported format. When C2 is set, C0 and C3 are also set.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfcomppcomma ( FCOMPP, )
//
// C prototype:
//  void dg_forthfcomppcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Execute state action:
//  Compiles code that compares ST0 with the value in ST1. This is the
//   same as subtracting ST1 from ST0 but not storing the
//   result anywhere. Then the top two values are dropped from the floating point
//   stack. (Dropped means the values are popped but not stored.)
//  Status flags in the floating point status register are changed according
//   to the result of the compare.
//   C0 is set if ST0 < ST1, cleared otherwise.
//   C3 is set if ST0 = ST1, cleared otherwise.
//   C2 is set if the compare could not be done and unordered exceptions are
//    masked. (Docs call trying to compare values that are not numbers unordered.)
//    This can happen if one or both of the values is not a number, or is in
//    an unsupported format. When C2 is set, C0 and C3 are also set.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfcomitost0comma ( FCOMI, FCOMI->ST0, )
//
// C prototype:
//  void dg_forthfcomitost0comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( floatingpointregister -- )
//
// Data stack in:
//
//  floatingpointregister         one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that compares ST0 with the value in the target. This is the
//   same as subtracting the target's value from ST0 but not storing the
//   result anywhere.
//  Status flags in the processor EFLAGS register are changed according
//   to the result of the compare.
//   the carry flag is set if ST0 < target's value, cleared otherwise.
//   the zero flag is set if ST0 = target's value, cleared otherwise.
//   the parity flag is set if the compare could not be done.
//    (Docs call trying to compare values that are not numbers unordered.)
//    This can happen if one or both of the values is not a number, or is in
//    an unsupported format.
//    When the parity flag is set, the zero and carry flags are also set.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfcomiptost0comma ( FCOMIP, FCOMIP->ST0, )
//
// C prototype:
//  void dg_forthfcomiptost0comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( floatingpointregister -- )
//
// Data stack in:
//
//  floatingpointregister         one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that compares ST0 with the value in the target. This is the
//   same as subtracting the target's value from ST0 but not storing the
//   result anywhere. Then the top value is dropped from the floating point
//   stack. (Dropped means the top value is popped but not stored.)
//  Status flags in the processor EFLAGS register are changed according
//   to the result of the compare.
//   the carry flag is set if ST0 < target's value, cleared otherwise.
//   the zero flag is set if ST0 = target's value, cleared otherwise.
//   the parity flag is set if the compare could not be done.
//    (Docs call trying to compare values that are not numbers unordered.)
//    This can happen if one or both of the values is not a number, or is in
//    an unsupported format.
//    When the parity flag is set, the zero and carry flags are also set.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfucomitost0comma ( FUCOMI, FUCOMI->ST0, )
//
// C prototype:
//  void dg_forthfucomitost0comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( floatingpointregister -- )
//
// Data stack in:
//
//  floatingpointregister         one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that compares ST0 with the value in the target. This is the
//   same as subtracting the target's value from ST0 but not storing the
//   result anywhere.
//  Status flags in the processor EFLAGS register are changed according
//   to the result of the compare.
//   the carry flag is set if ST0 < target's value, cleared otherwise.
//   the zero flag is set if ST0 = target's value, cleared otherwise.
//   the parity flag is set if the compare could not be done.
//    (Docs call trying to compare values that are not numbers unordered.)
//    This can happen if one or both of the values is not a number, or is in
//    an unsupported format.
//    When the parity flag is set, the zero and carry flags are also set.
//   In this instruction, the invalid arithmetic exception is only generated
//    when one or both of the values is a signaling NaN or is in an
//    unsupported format. If a value is a quiet NaN, an exception is not
//    generated.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfucomiptost0comma ( FUCOMIP, FUCOMIP->ST0, )
//
// C prototype:
//  void dg_forthfucomiptost0comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( floatingpointregister -- )
//
// Data stack in:
//
//  floatingpointregister         one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that compares ST0 with the value in the target. This is the
//   same as subtracting the target's value from ST0 but not storing the
//   result anywhere. Then the top value is dropped from the floating point
//   stack. (Dropped means the top value is popped but not stored.)
//  Status flags in the processor EFLAGS register are changed according
//   to the result of the compare.
//   the carry flag is set if ST0 < target's value, cleared otherwise.
//   the zero flag is set if ST0 = target's value, cleared otherwise.
//   the parity flag is set if the compare could not be done.
//    (Docs call trying to compare values that are not numbers unordered.)
//    This can happen if one or both of the values is not a number, or is in
//    an unsupported format.
//    When the parity flag is set, the zero and carry flags are also set.
//   In this instruction, the invalid arithmetic exception is only generated
//    when one or both of the values is a signaling NaN or is in an
//    unsupported format. If a value is a quiet NaN, an exception is not
//    generated.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfcoscomma ( FCOS, )
//
// C prototype:
//  void dg_forthfcoscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Execute state action:
//  Compiles code that replaces the value in ST0 with its cosine.
//  The value is in radians.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfdecstpcomma ( FDECSTP, )
//
// C prototype:
//  void dg_forthfdecstpcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Execute state action:
//  Compiles code that decrements the floating point stack pointer.
//  The floating point stack pointer is a 3 bit value (0-7).
//  Nothing else is affected.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfdivtost0comma ( FDIV, FDIV->ST0, )
//
// C prototype:
//  void dg_forthfdivtost0comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   floatingpointregister
//   floatingpointregister FPSR
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   FPSR                         specifies a floating point stack register
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that divides ST0 by the floating point value in the target,
//   then stores the result in ST0.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfdivst0tocomma ( FDIVST0->, )
//
// C prototype:
//  void dg_forthfdivst0tocomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   floatingpointregister
//   floatingpointregister FPSR
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   FPSR                         specifies a floating point stack register
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that divides the value in the target floating point register
//   by ST0, then stores the result to the target floating point register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfdivpst0tocomma ( FDIVPST0->, )
//
// C prototype:
//  void dg_forthfdivpst0tocomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   floatingpointregister
//   floatingpointregister FPSR
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   FPSR                         specifies a floating point stack register
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that divides the value in the target floating point register
//   by ST0, then stores the result to the target floating point register.
//   Then the top value on the floating point stack is dropped.
//   If ST0 was empty before the compiled code executes, then the #IS exception
//    is generated.
//   When a floating point stack underflow occurs, the C1 flag is set to 0.
//   I wonder what happens if the source floating point register is empty?
//   Is C1 only set when the #IS exception is masked?
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfidivtost0comma ( FIDIV, FIDIV->ST0, )
//
// C prototype:
//  void dg_forthfidivtost0comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   16BIT
//   32BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   FPSR                         specifies a floating point stack register
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that divides ST0 by the integer value in the target,
//   then stores the result to ST0.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfdivrtost0comma ( FDIVR, FDIVR->ST0, )
//
// C prototype:
//  void dg_forthfdivrtost0comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to the Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   floatingpointregister
//   floatingpointregister FPSR
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   FPSR                         specifies a floating point stack register
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that divides the floating point value in the target by ST0,
//   then stores the result in ST0.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfdivrst0tocomma ( FDIVRST0->, )
//
// C prototype:
//  void dg_forthfdivrst0tocomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to the Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   floatingpointregister
//   floatingpointregister FPSR
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   FPSR                         specifies a floating point stack register
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that divides ST0 by the value in the target floating point
//   register, then stores the result to the target floating point register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfdivrpst0tocomma ( FDIVRPST0->, )
//
// C prototype:
//  void dg_forthfdivrpst0tocomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to the Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   floatingpointregister
//   floatingpointregister FPSR
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   FPSR                         specifies a floating point stack register
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that divides ST0 by the value in the target floating point
//   register, then stores the result to the target floating point register.
//   Then the top value on the floating point stack is dropped.
//   If ST0 was empty before the compiled code executes, then the #IS exception
//    is generated.
//   When a floating point stack underflow occurs, the C1 flag in the floating
//    point status register is cleared (set to 0).
//   I wonder what happens if the source floating point register is empty?
//   Is C1 only set when the #IS exception is masked?
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfidivrtost0comma ( FIDIVR, FIDIVR->ST0, )
//
// C prototype:
//  void dg_forthfidivrtost0comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to the Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   16BIT
//   32BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   FPSR                         specifies a floating point stack register
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that divides the integer value in the target by ST0,
//   then stores the result to ST0.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthffreecomma ( FFREE, )
//
// C prototype:
//  void dg_forthffreecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to the Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   floatingpointregister
//   floatingpointregister FPSR
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   FPSR                         specifies a floating point stack register
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that changes the value in the floating point tag register for
//   the target floating point register to indicate that the floating point register
//   is free. Only the two bits for the target floating point register in the tag
//   register are affected.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthficomtost0comma ( FICOM, FICOM->ST0, )
//
// C prototype:
//  void dg_forthficomtost0comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to the Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   16BIT
//   32BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   FPSR                         specifies a floating point stack register
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that compares the integer value at the target with the value
//   in ST0.
//  Status flags in the floating point status register are changed according
//   to the result of the compare.
//   C0 is set if ST0 < target's value, cleared otherwise.
//   C3 is set if ST0 = target's value, cleared otherwise.
//   C2 is set if the compare could not be done and unordered exceptions are
//    masked. (Docs call trying to compare values that are not numbers unordered.)
//    This can happen if the value in ST0 is not a number, or is in
//    an unsupported format. When C2 is set, C0 and C3 are also set.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthficomptost0comma ( FICOMP, FICOMP->ST0, )
//
// C prototype:
//  void dg_forthficomptost0comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to the Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   16BIT
//   32BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   FPSR                         specifies a floating point stack register
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that compares the integer value at the target with the value
//   in ST0, then pops the floating point stack.
//  Status flags in the floating point status register are changed according
//   to the result of the compare.
//   C0 is set if ST0 < target's value, cleared otherwise.
//   C3 is set if ST0 = target's value, cleared otherwise.
//   C2 is set if the compare could not be done and unordered exceptions are
//    masked. (Docs call trying to compare values that are not numbers unordered.)
//    This can happen if the value in ST0 is not a number, or is in
//    an unsupported format. When C2 is set, C0 and C3 are also set.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfildcomma ( FILD, )
//
// C prototype:
//  void dg_forthfildcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to the Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   16BIT
//   32BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   FPSR                         specifies a floating point stack register
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that pushes the integer value in the target to the
//   floating point stack. After the compiled code is executed, the floating
//   point representation of the integer value is in ST0.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfincstpcomma ( FINCSTP, )
//
// C prototype:
//  void dg_forthfincstpcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Execute state action:
//  Compiles code that increments the floating point stack pointer.
//  The floating point stack pointer is a 3 bit value (0-7).
//  Nothing else is affected.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfinitcomma ( FINIT, )
//
// C prototype:
//  void dg_forthfinitcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Execute state action:
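//  Compiles code that initializes the floating point unit after checking for
//   and handling any pending unmasked floating point exceptions.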
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfninitcomma ( FNINIT, )
//
// C prototype:
//  void dg_forthfninitcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Execute state action:
//  Compiles code that initializes the floating point unit without checking
//   to see if there are any pending floating point exceptions.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfistcomma ( FIST, )
//
// C prototype:
//  void dg_forthfistcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   16BIT
//   32BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   FPSR                         specifies a floating point stack register
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that rounds the floating point value in ST0 to an integer
//   using the current rounding mode indicated in the RC field of the floating
//   point control register and stores the integer in the target memory location.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfistpcomma ( FISTP, )
//
// C prototype:
//  void dg_forthfistpcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   16BIT
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   FPSR                         specifies a floating point stack register
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that rounds the floating point value in ST0 to an integer
//   using the current rounding mode indicated in the RC field of the floating
//   point control register, stores the integer in the target memory location,
//   and then drops the top value off the floating point stack.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfisttpcomma ( FISTTP, )
//
// C prototype:
//  void dg_forthfisttpcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   16BIT
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   FPSR                         specifies a floating point stack register
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that converts the floating point value in ST0 to an integer
//   using truncation, stores the integer in the target memory location,
//   and then drops the top value off the floating point stack.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfldcomma ( FLD, )
//
// C prototype:
//  void dg_forthfldcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   floatingpointregister
//   floatingpointregister FPSR
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   32BIT
//   64BIT
//   80BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   FPSR                         specifies a floating point stack register
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   80BIT                        sets the data size of the instruction to
//                                 10 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that pushes the floating point value in the target onto the
//   floating point stack.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfld1comma ( FLD1, )
//
// C prototype:
//  void dg_forthfld1comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Execute state action:
//  Compiles code that pushes 1.0 onto the floating point stack.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfld2tcomma ( FLD2T, )
//
// C prototype:
//  void dg_forthfld2tcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Execute state action:
//  Compiles code that pushes the log base 2 of 10 onto the floating point stack.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfld2ecomma ( FLD2E, )
//
// C prototype:
//  void dg_forthfld2ecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the other used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Execute state action:
//  Compiles code that pushes the log base 2 of e onto the floating point stack.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfldpicomma ( FLDPI, )
//
// C prototype:
//  void dg_forthfldpicomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the other used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Execute state action:
//  Compiles code that pushes pi onto the floating point stack.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfldg2comma ( FLDLG2, )
//
// C prototype:
//  void dg_forthfldg2comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the other used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Execute state action:
//  Compiles code that pushes the log base 10 of 2 onto the floating point stack.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfldn2comma ( FLDLN2, )
//
// C prototype:
//  void dg_forthfldn2comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the other used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Execute state action:
//  Compiles code that pushes the log base e of 2 onto the floating point stack.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfldzcomma ( FLDZ, )
//
// C prototype:
//  void dg_forthfldzcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the other used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Execute state action:
//  Compiles code that pushes +0.0 onto the floating point stack.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfldcwcomma ( FLDCW, )
//
// C prototype:
//  void dg_forthfldcwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the other used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that copies the target to the floating point control word.
//  The data size for this instruction is 16 bits.
//  Data size specifiers are ignored for this word.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Data:
//  Floating point control register:
//   bits 15, 14, 13, 7, 6    unused
//   bit 12                   infinity control (not used anymore)
//   bits 11,10               rounding control
//                             00 = round to nearest
//                             01 = down towards -infinity
//                             10 = up towards +infinity
//                             11 = truncate
//   bits 9,8                 precision control (significand size in bits)
//                             00 = 24 bits (single precision)
//                             01 = reserved
//                             10 = 53 bits (double precision)
//                             11 = 64 bits (extended precision), the default
//   bit 5                    precision exception mask
//   bit 4                    underflow exception mask
//   bit 3                    overflow exception mask
//   bit 2                    zero divide exception mask
//   bit 1                    denormal operand exception mask
//   bit 0                    invalid operand exception mask
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfldenvcomma ( FLDENV, )
//
// C prototype:
//  void dg_forthfldenvcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the other used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that loads the floating point environment from a memory
//   target. In 64 bit mode, the memory target is 28 bytes.
//   Data size specifiers are ignored for this word.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfmultost0comma ( FMUL, FMUL->ST0, )
//
// C prototype:
//  void dg_forthfmultost0comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the other used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   floatingpointregister
//   floatingpointregister FPSR
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   FPSR                         specifies a floating point stack register
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and cannot come in the middle
//                                 of them.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and cannot come in the middle
//                                 of them.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that multiplies the floating point value in the target by ST0,
//   then stores the result in ST0.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfmulst0tocomma ( FMULST0->, )
//
// C prototype:
//  void dg_forthfmulst0tocomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the other used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   floatingpointregister
//   floatingpointregister FPSR
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   FPSR                         specifies a floating point stack register
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that multiplies ST0 by the value in the target floating point register,
//   then stores the result to the target floating point register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfmulpst0tocomma ( FMULPST0->, )
//
// C prototype:
//  void dg_forthfmulpst0tocomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the other used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   floatingpointregister
//   floatingpointregister FPSR
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   FPSR                         specifies a floating point stack register
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that multiplies ST0 by the value in the target floating point register,
//   then stores the result to the target floating point register.
//   Then the top value on the floating point stack is dropped.
//   If ST0 is empty before the compiled code executes, then the #IS exception
//    is generated.
//   When a floating point stack underflow occurs, the C1 condition code flag
//    is cleared to 0. (C1 = 1 indicates a stack overflow.)
//   I wonder what happens if the source floating point register is empty?
//   Is C1 only updated when the #IS exception is masked?
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfimultost0comma ( FIMUL, FIMUL->ST0, )
//
// C prototype:
//  void dg_forthfimultost0comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the other used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   16BIT
//   32BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   FPSR                         specifies a floating point stack register
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and cannot come in the middle
//                                 of them.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and cannot come in the middle
//                                 of them.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that multiplies the integer value in the target by ST0,
//   then stores the result in ST0.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfnopcomma ( FNOP, )
//
// C prototype:
//  void dg_forthfnopcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the other used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Execute state action:
//  Compiles code that does nothing.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfpatancomma ( FPATAN, )
//
// C prototype:
//  void dg_forthfpatancomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the other used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Execute state action:
//  Compiles code that replaces the value in ST1 with the arctangent(ST1/ST0),
//   then pops the floating point stack. So after the compiled code is
//   executed, the result in radians is on top of the stack in ST0.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfpremcomma ( FPREM, )
//
// C prototype:
//  void dg_forthfpremcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the other used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Execute state action:
//  Compiles code that replaces the value in ST0 with the remainder from
//   dividing ST0 by ST1. The calculation is done this way:
//   Remainder = ST0 - (ST1 * INT(ST0/ST1)) where INT(x) is integer truncation
//   towards 0.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfprem1comma ( FPREM1, )
//
// C prototype:
//  void dg_forthfprem1comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that replaces the value in ST0 with the IEEE remainder from
//   dividing ST0 by ST1. The calculation is done this way:
//   Remainder = ST0 - (ST1 * RND(ST0/ST1)) where RND(x) is rounding to the
//   nearest integer.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfptancomma ( FPTAN, )
//
// C prototype:
//  void dg_forthfptancomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that replaces the value in ST0 with its tangent, then pushes
//   1.0 to the floating point stack.
//   The value is in radians and its magnitude must be less than 2^63.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthframecommacurly ( FRAME,> )
//
// C prototype:
//  void dg_forthframecommacurly (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Stack action shorthand:
//  ( "<delimiters>word1<delimiters>word2"...
//    <delimiters>wordu<delimiters>}morestuff" -currentinputbuffer- 
//     "morestuff" )
//  
// Current input buffer's current offset in:
//  "<delimiters>word1<delimiters>word2"...
//    <delimiters>wordu<delimiters>}morestuff"
//
// Current input buffer's current offset out:
//  "morestuff"          
//
// Forth standard:
//  none
//                                                          
// Execute mode action:
//  Moves the current offset pointer for the current input buffer to the character 
//   after the next > or to the end of the buffer if > is not found.
//  Then the frame_offset is set to 0.
//  For each word found before the > or end of buffer, whichever comes first:
//   Tries to convert the word to a number using the current BASE. 
//   If the word can be converted to a number, it is pushed to the data stack.
//   If the word can not be converted to a number,
//    If the depth of the data stack is greater than the starting depth, the number
//     of cells the local variable uses is popped off the data stack, otherwise the
//     number of cells the local variable uses is 1. 
//    Then the number of bytes the local variable uses is subtracted from the 
//     frame_offset.
//    Then creates a new definition in the local wordlist that:
//     Pushes the addressing mode sequence frame_offset N [R+B] onto the data stack.
//  After all the words are found, then this compiles code that initializes the 
//   subroutine frame and compiles code that subtracts the number of bytes the 
//   local variables use from the return stack.
//
// Compile mode action:
//  Compiles a call to the routine that does the above action.
//
// Note:
//  No whitespace is needed before or after the terminating >
//  This means the local variable names can not have the > character in them.
//
// Note:
//  The subroutine frame initialization looks like:
//   0 N  cellsused*sizeof(UINT64) N  ENTER,
//
//  which is equivalent to:
//   RBP PUSH,
//   RSP  RBP  MOV,
//   cellsused*sizeof(UINT64) N  RSP  SUB,
//
// Note:
//  Cell size not checked for errors.
//
// Examples:
//  CODE useUINT64localvariables
//   FRAME,>
//    x
//    y <
//   RDI   x    MOV,  // store first parameter in local variable x
//   RSI   y    MOV,  // store second parameter in local variable y
//   x     RAX  MOV,  // move contents of local variable x to RAX
//   y     RAX  ADD,  // add contents of local variable y to RAX
//   LEAVE,           // undo the stack frame
//   RET,
//  END-CODE
//
//  CODE useUINT128localvariables
//   FRAME,>
//    2 x
//    2 y <
//   RDI   x    MOV,         // store first parameter in local variable x 
//   RSI   y    MOV,         // store second parameter in local variable y
//   x  RDI  LEA,            // get address of x into RDI
//   0 N  RDI 8 [R+N]  MOV,  // store 0 in local variable x hi
//   y  RSI  LEA,            // get address of y into RSI
//   0 N  RSI 8 [R+N]  MOV,  // store 0 in local variable y hi
//
//   x  RAX  MOV,            // move contents of local variable x lo to RAX
//   RDI 8 [R+N]  RDX  MOV,  // move contents of local variable x hi to RDX
//   y  RAX  ADD,            // add contents of local variable y lo to RAX
//   RSI 8 [R+N]  RDX  ADC,  // add contents of local variable y hi to RDX plus carry 
//   LEAVE,           // undo the stack frame
//   RET,
//  END-CODE  
//  
////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthframecurly ( FRAME> )
//
// C prototype:
//  void dg_forthframecurly (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Stack action shorthand:
//  ( "<delimiters>word1<delimiters>word2"...
//    <delimiters>wordu<delimiters>}morestuff" -currentinputbuffer- 
//     "morestuff" )
//  
// Current input buffer's current offset in:
//  "<delimiters>word1<delimiters>word2"...
//    <delimiters>wordu<delimiters>}morestuff"
//
// Current input buffer's current offset out:
//  "morestuff"          
//
// Forth standard:
//  none
//                                                          
// Execute mode action:
//  Moves the current offset pointer for the current input buffer to the character 
//   after the next > or to the end of the buffer if > is not found.
//  Then the frame_offset is set to 0.
//  For each word found before the > or end of buffer, whichever comes first:
//   Tries to convert the word to a number using the current BASE. 
//   If the word can be converted to a number, it is pushed to the data stack.
//   If the word can not be converted to a number,
//    If the depth of the data stack is greater than the starting depth, the number
//     of cells the local variable uses is popped off the data stack, otherwise the
//     number of cells the local variable uses is 1. 
//    Then the number of bytes the local variable uses is subtracted from the 
//     frame_offset.
//    Then creates a new definition in the local wordlist that:
//     Pushes the addressing mode sequence frame_offset N [R+B] onto the data stack.
//
// Compile mode action:
//  Compiles a call to the routine that does the above action.
//
// Note:
//  No whitespace is needed before or after the terminating >
//  This means the local variable names can not have the > character in them.
//
// Note:
//  Cell size not checked for errors.
//
// Examples:
//  HEX
//  CODE useUINT64localvariables
//   FRAME>
//    x
//    y <
//   RBP PUSH,
//   RSP RBP MOV,
//   10 N  RSP  SUB,
//   RDI   x    MOV,  // store first parameter in local variable x
//   RSI   y    MOV,  // store second parameter in local variable y
//   x     RAX  MOV,  // move contents of local variable x to RAX
//   y     RAX  ADD,  // add contents of local variable y to RAX
//   RBP RSP MOV,     // undo the stack frame
//   RBP POP,
//   RET,
//  END-CODE
//
//  CODE useUINT128localvariables
//   FRAME>
//    2 x
//    2 y <
//   0 N  20 N  ENTER,
//   RDI   x    MOV,         // store first parameter in local variable x 
//   RSI   y    MOV,         // store second parameter in local variable y
//   x  RDI  LEA,            // get address of x into RDI
//   0 N  RDI 8 [R+N]  MOV,  // store 0 in local variable x hi
//   y  RSI  LEA,            // get address of y into RSI
//   0 N  RSI 8 [R+N]  MOV,  // store 0 in local variable y hi
//
//   x  RAX  MOV,            // move contents of local variable x lo to RAX
//   RDI 8 [R+N]  RDX  MOV,  // move contents of local variable x hi to RDX
//   y  RAX  ADD,            // add contents of local variable y lo to RAX
//   RSI 8 [R+N]  RDX  ADC,  // add contents of local variable y hi to RDX plus carry 
//   LEAVE,           // undo the stack frame
//   RET,
//  END-CODE
//  
////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthlocalbracketrbppncurly ( LOCAL-[RBP+N]> )
//
// C prototype:
//  void dg_forthlocalbracketrbppncurly (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Stack action shorthand:
//  ( "<delimiters>word1<delimiters>word2"...
//    <delimiters>wordu<delimiters>}morestuff" -currentinputbuffer- 
//     "morestuff" )
//  
// Current input buffer's current offset in:
//  "<delimiters>word1<delimiters>word2"...
//    <delimiters>wordu<delimiters>}morestuff"
//
// Current input buffer's current offset out:
//  "morestuff"          
//
// Forth standard:
//  none
//                                                          
// Execute mode action:
//  Moves the current offset pointer for the current input buffer to the character 
//   after the next > or to the end of the buffer if > is not found.
//  For each word found before the > or end of buffer, whichever comes first:
//   Tries to convert the word to a number using the current BASE. 
//   If the word can be converted to a number, it is pushed to the data stack.
//   If the word can not be converted to a number,
//    The frame_offset is popped off the data stack.
//    Then creates a new definition in the local wordlist that:
//     Pushes the addressing mode sequence frame_offset N [R+B] onto the data stack.
//
// Compile mode action:
//  Compiles a call to the routine that does the above action.
//
// Note:
//  No whitespace is needed before or after the terminating >
//  This means the local variable names can not have the > character in them.
//
// Note:
//  Cell size not checked for errors.
//
// Examples:
//  HEX
//  CODE useUINT64localvariables
//   LOCAL-[RBP+N]>
//    -8 x
//    -10 y <
//   RBP PUSH,
//   RSP RBP MOV,
//   10 N  RSP  SUB,
//   RDI   x    MOV,  // store first parameter in local variable x
//   RSI   y    MOV,  // store second parameter in local variable y
//   x     RAX  MOV,  // move contents of local variable x to RAX
//   y     RAX  ADD,  // add contents of local variable y to RAX
//   RBP RSP MOV,     // undo the stack frame
//   RBP POP,
//   RET,
//  END-CODE
//
//  CODE useUINT128localvariables
//   LOCAL-[RBP+N]>
//    -10 x
//    -20 y <
//   0 N  20 N  ENTER,
//   RDI   x    MOV,         // store first parameter in local variable x 
//   RSI   y    MOV,         // store second parameter in local variable y
//   x  RDI  LEA,            // get address of x into RDI
//   0 N  RDI 8 [R+N]  MOV,  // store 0 in local variable x hi
//   y  RSI  LEA,            // get address of y into RSI
//   0 N  RSI 8 [R+N]  MOV,  // store 0 in local variable y hi
//
//   x  RAX  MOV,            // move contents of local variable x lo to RAX
//   RDI 8 [R+N]  RDX  MOV,  // move contents of local variable x hi to RDX
//   y  RAX  ADD,            // add contents of local variable y lo to RAX
//   RSI 8 [R+N]  RDX  ADC,  // add contents of local variable y hi to RDX plus carry 
//   LEAVE,           // undo the stack frame
//   RET,
//  END-CODE
//  
////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfrndintcomma ( FRNDINT, )
//
// C prototype:
//  void dg_forthfrndintcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that replaces the value in ST0 with the value in ST0 rounded
//   to an integer using the rounding mode specified by the RC field
//   of the FPU control word.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfrstorcomma ( FRSTOR, )
//
// C prototype:
//  void dg_forthfrstorcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that:
//   Restores the floating point state from a memory target.
//   In 64 bit mode, the memory target is 108 bytes.
//   Data size specifiers are ignored for this word.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfsavecomma ( FSAVE, )
//
// C prototype:
//  void dg_forthfsavecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that:
//   checks for and handles floating point exceptions ( WAIT, )
//   then stores the floating point state to a memory target.
//   In 64 bit mode, the memory target is 108 bytes.
//   Data size specifiers are ignored for this word.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfnsavecomma ( FNSAVE, )
//
// C prototype:
//  void dg_forthfnsavecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that:
//   Stores the floating point state to a memory target.
//   In 64 bit mode, the memory target is 108 bytes.
//   Data size specifiers are ignored for this word.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfscalecomma ( FSCALE, )
//
// C prototype:
//  void dg_forthfscalecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that truncates the value in ST1 and adds the integer result to
//   the exponent in ST0.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfsincomma ( FSIN, )
//
// C prototype:
//  void dg_forthfsincomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that replaces the value in ST0 with its sine.
//  The value is in radians.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfsincoscomma ( FSINCOS, )
//
// C prototype:
//  void dg_forthfsincoscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that replaces the value in ST0 with the value's sine,
//   then pushes the value's cosine to the floating point stack.
//  So after compiled code is executed, ST0 holds the value's cosine,
//   and ST1 holds the value's sine.
//  The value is in radians.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfsqrtcomma ( FSQRT, )
//
// C prototype:
//  void dg_forthfsqrtcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that replaces the value in ST0 with the value's square root.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfstcomma ( FST, )
//
// C prototype:
//  void dg_forthfstcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   floatingpointregister
//   floatingpointregister FPSR
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   FPSR                         specifies a floating point stack register
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that copies the value in ST0 to the target (a memory
//   location or a floating point stack register) as a floating point value.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfstpcomma ( FSTP, )
//
// C prototype:
//  void dg_forthfstpcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   floatingpointregister
//   floatingpointregister FPSR
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   FPSR                         specifies a floating point stack register
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   80BIT                        sets the data size of the instruction to
//                                 10 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that pops the top value off the floating point stack and
//   stores it to the target (a memory location or a floating point stack
//   register) as a floating point value.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfstcwcomma ( FSTCW, )
//
// C prototype:
//  void dg_forthfstcwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that:
//   checks for and handles unmasked floating point exceptions ( WAIT, )
//   then copies the floating point control word to the target.
//  The data size for this instruction is 16 bits.
//  Data size specifiers are ignored for this word.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Data:
//  Floating point control register:
//   bits 15, 14, 13, 7, 6    unused
//   bit 12                   infinity control (not used anymore)
//   bits 11,10               rounding control
//                             00 = round to nearest
//                             01 = down towards -infinity
//                             10 = up towards +infinity
//                             11 = truncate
//   bits 9,8                 precision control
//                             00 = 24
//                             01 = reserved
//                             10 = 53
//                             11 = 64 is the default
//   bit 5                    precision exception mask
//   bit 4                    underflow exception mask
//   bit 3                    overflow exception mask
//   bit 2                    zero divide exception mask
//   bit 1                    denormal operand exception mask
//   bit 0                    invalid operand exception mask
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfnstcwcomma ( FNSTCW, )
//
// C prototype:
//  void dg_forthfnstcwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that copies the floating point control word to the target.
//  The data size for this instruction is 16 bits.
//  Data size specifiers are ignored for this word.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Data:
//  Floating point control register:
//   bits 15, 14, 13, 7, 6    unused
//   bit 12                   infinity control (not used anymore)
//   bits 11,10               rounding control
//                             00 = round to nearest
//                             01 = down towards -infinity
//                             10 = up towards +infinity
//                             11 = truncate
//   bits 9,8                 precision control
//                             00 = 24
//                             01 = reserved
//                             10 = 53
//                             11 = 64 is the default
//   bit 5                    precision exception mask
//   bit 4                    underflow exception mask
//   bit 3                    overflow exception mask
//   bit 2                    zero divide exception mask
//   bit 1                    denormal operand exception mask
//   bit 0                    invalid operand exception mask
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfstenvcomma ( FSTENV, )
//
// C prototype:
//  void dg_forthfstenvcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that:
//   checks for and handles floating point exceptions ( WAIT, )
//   then stores the floating point environment to a memory target.
//   In 64 bit mode, the memory target is 28 bytes.
//   Data size specifiers are ignored for this word.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfnstenvcomma ( FNSTENV, )
//
// C prototype:
//  void dg_forthfnstenvcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that stores the floating point environment to a memory
//   target. In 64 bit mode, the memory target is 28 bytes.
//   Data size specifiers are ignored for this word.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfstswcomma ( FSTSW, )
//
// C prototype:
//  void dg_forthfstswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that:
//   checks for and handles unmasked floating point exceptions ( WAIT, )
//   then stores the floating point status register to a 16 bit memory target.
//   Data size specifiers are ignored for this word.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Data:
//  Floating Point Status Register
//   bit 15         FPU busy
//   bit 14         C3  ( used as the zero flag for compares )
//   bits 13,12,11  top of stack pointer
//   bit 10         C2
//   bit 9          C1
//   bit 8          C0 ( used like the carry/borrow flag for compares )
//   bit 7          error summary status
//   bit 6          stack fault
//   bit 5          precision exception flag
//   bit 4          underflow exception flag ( numeric underflow )
//   bit 3          overflow exception flag
//   bit 2          zero divide exception flag
//   bit 1          denormalized operand exception flag
//   bit 0          invalid operation exception flag
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfnstswcomma ( FNSTSW, )
//
// C prototype:
//  void dg_forthfnstswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that stores the floating point status register to a 16 bit
//   memory target.
//   Data size specifiers are ignored for this word.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Data:
//  Floating Point Status Register
//   bit 15         FPU busy
//   bit 14         C3  ( used as the zero flag for compares )
//   bits 13,12,11  top of stack pointer
//   bit 10         C2
//   bit 9          C1
//   bit 8          C0 ( used like the carry/borrow flag for compares )
//   bit 7          error summary status
//   bit 6          stack fault
//   bit 5          precision exception flag
//   bit 4          underflow exception flag ( numeric underflow )
//   bit 3          overflow exception flag
//   bit 2          zero divide exception flag
//   bit 1          denormalized operand exception flag
//   bit 0          invalid operation exception flag
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfnstswtoaxcomma ( FNSTSW->AX, )
//
// C prototype:
//  void dg_forthfnstswtoaxcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Execute state action:
//  Compiles code that stores the FPU status word in AX without checking for
//   pending floating point exceptions.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfstswtoaxcomma ( FSTSW->AX, )
//
// C prototype:
//  void dg_forthfstswtoaxcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Execute state action:
//  Compiles code that stores the floating point status register to the AX
//   register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Data:
//  Floating Point Status Register
//   bit 15         FPU busy
//   bit 14         C3  ( used as the zero flag for compares )
//   bits 13,12,11  top of stack pointer
//   bit 10         C2
//   bit 9          C1
//   bit 8          C0 ( used like the carry/borrow flag for compares )
//   bit 7          error summary status
//   bit 6          stack fault
//   bit 5          precision exception flag
//   bit 4          underflow exception flag ( numeric underflow )
//   bit 3          overflow exception flag
//   bit 2          zero divide exception flag
//   bit 1          denormalized operand exception flag
//   bit 0          invalid operation exception flag
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfsubtost0comma ( FSUB, FSUB->ST0, )
//
// C prototype:
//  void dg_forthfsubtost0comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   floatingpointregister
//   floatingpointregister FPSR
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   FPSR                         specifies a floating point stack register
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that does ST0 minus the floating point value in the target,
//   then stores the result in ST0.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfsubst0tocomma ( FSUBST0->, )
//
// C prototype:
//  void dg_forthfsubst0tocomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   floatingpointregister
//   floatingpointregister FPSR
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   FPSR                         specifies a floating point stack register
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that does the value in the target floating point register
//  minus ST0, then stores the result to the target floating point register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfsubpst0tocomma ( FSUBPST0->, )
//
// C prototype:
//  void dg_forthfsubpst0tocomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   floatingpointregister
//   floatingpointregister FPSR
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   FPSR                         specifies a floating point stack register
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that does the value in the target floating point register
//   minus ST0, then stores the result to the target floating point register.
//   Then the top value on the floating point stack is dropped.
//   If ST0 was empty before the compiled code executes, then the #IS exception
//    is generated.
//   When a floating point stack underflow occurs, the C1 flag is cleared (C1 = 0).
//   I wonder what happens if the source floating point register is empty?
//   Is C1 only set when the #IS exception is masked?
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfisubtost0comma ( FISUB, FISUB->ST0, )
//
// C prototype:
//  void dg_forthfisubtost0comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   16BIT
//   32BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   FPSR                         specifies a floating point stack register
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that does ST0 minus the integer value in the target,
//   then stores the result to ST0.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfsubrtost0comma ( FSUBR, FSUBR->ST0, )
//
// C prototype:
//  void dg_forthfsubrtost0comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   floatingpointregister
//   floatingpointregister FPSR
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   FPSR                         specifies a floating point stack register
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that does the floating point value in the target minus ST0,
//   then stores the result in ST0.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfsubrst0tocomma ( FSUBRST0->, )
//
// C prototype:
//  void dg_forthfsubrst0tocomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   floatingpointregister
//   floatingpointregister FPSR
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   FPSR                         specifies a floating point stack register
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that does ST0 minus the value in the target floating point
//   register, then stores the result to the target floating point register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfsubrpst0tocomma ( FSUBRPST0->, )
//
// C prototype:
//  void dg_forthfsubrpst0tocomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   floatingpointregister
//   floatingpointregister FPSR
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   FPSR                         specifies a floating point stack register
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that does ST0 minus the value in the target floating point
//   register, then stores the result to the target floating point register.
//   Then the top value on the floating point stack is dropped.
//   If ST0 was empty before the compiled code executes, then the #IS exception
//    is generated.
//   When a floating point stack underflow occurs, the C1 flag is set to 0.
//   I wonder what happens if the source floating point register is empty?
//   Is C1 only set when the #IS exception is masked?
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfisubrtost0comma ( FISUBR, FISUBR->ST0, )
//
// C prototype:
//  void dg_forthfisubrtost0comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   16BIT
//   32BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   FPSR                         specifies a floating point stack register
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that does the integer value in the target minus ST0,
//   then stores the result to ST0.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthftstcomma ( FTST, )
//
// C prototype:
//  void dg_forthftstcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that compares ST0 with 0.
//   The result of the compare is in the floating point status register flags.
//    If ST0 < 0 then C0 is set.
//    If ST0 = 0 then C3 is set.
//    If ST0 is not a number or in an unsupported format, C2 is set
//     along with C0 and C3
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfucomtost0comma ( FUCOM, FUCOM->ST0, )
//
// C prototype:
//  void dg_forthfucomtost0comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( floatingpointregister -- )
//
// Data stack in:
//
//  floatingpointregister         one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that compares ST0 with the value in the target. This is the
//   same as subtracting the target's value from ST0 but not storing the
//   result anywhere.
//  Status flags in the floating point status register are changed according
//   to the result of the compare.
//   C0 is set if ST0 < target's value, cleared otherwise.
//   C3 is set if ST0 = target's value, cleared otherwise.
//   C2 is set if the compare could not be done and unordered exceptions are
//    masked. (Docs call trying to compare values that are not numbers unordered.)
//    This can happen if one or both of the values is not a number, or is in
//    an unsupported format. When C2 is set, C0 and C3 are also set.
//   This instruction is the same as FCOM except QNaNs do not cause an exception.
//   SNaNs and unsupported formats still cause an exception like FCOM.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfucomptost0comma ( FUCOMP, FUCOMP->ST0, )
//
// C prototype:
//  void dg_forthfucomptost0comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( floatingpointregister -- )
//
// Data stack in:
//
//  floatingpointregister         one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that compares ST0 with the value in the target. This is the
//   same as subtracting the target's value from ST0 but not storing the
//   result anywhere. After doing the compare, the compiled code drops the
//   top value from the floating point stack.
//  Status flags in the floating point status register are changed according
//   to the result of the compare.
//   C0 is set if ST0 < target's value, cleared otherwise.
//   C3 is set if ST0 = target's value, cleared otherwise.
//   C2 is set if the compare could not be done and unordered exceptions are
//    masked. (Docs call trying to compare values that are not numbers unordered.)
//    This can happen if one or both of the values is not a number, or is in
//    an unsupported format. When C2 is set, C0 and C3 are also set.
//   This instruction is the same as FCOM except QNaNs do not cause an exception.
//   SNaNs and unsupported formats still cause an exception like FCOM.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfucomppcomma ( FUCOMPP, )
//
// C prototype:
//  void dg_forthfucomppcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that compares ST0 with ST1. This is the
//   same as subtracting ST1 from ST0 but not storing the result.
//   After doing the compare, the compiled code drops the
//   top two values from the floating point stack.
//  Status flags in the floating point status register are changed according
//   to the result of the compare.
//   C0 is set if ST0 < ST1, cleared otherwise.
//   C3 is set if ST0 = ST1, cleared otherwise.
//   C2 is set if the compare could not be done and unordered exceptions are
//    masked. (Docs call trying to compare values that are not numbers unordered.)
//    This can happen if one or both of the values is not a number, or is in
//    an unsupported format. When C2 is set, C0 and C3 are also set.
//   This instruction is the same as FCOM except QNaNs do not cause an exception.
//   SNaNs and unsupported formats still cause an exception like FCOM.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfxamcomma ( FXAM, )
//
// C prototype:
//  void dg_forthfxamcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that examines the contents of ST0 and sets the floating point
//   status flags according to the result.
//                            C3 C2 C0
//   unsupported format        0  0  0
//   not a number (NaN)        0  0  1
//   normal finite number      0  1  0
//   infinity                  0  1  1
//   zero                      1  0  0
//   empty                     1  0  1
//   denormal number           1  1  0
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfswapcomma ( FXCH, FSWAP, )
//
// C prototype:
//  void dg_forthfswapcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that exchanges the contents of ST0 with the contents of ST1.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfxchst0tocomma ( FXCH->ST0, FXCHST0-> )
//
// C prototype:
//  void dg_forthfxchst0tocomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( floatingpointregister -- )
//
// Data stack in:
//
//  floatingpointregister         one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
// Data stack out:
//  none
//
// Execute state action:
//  Compiles code that exchanges the value in ST0 with the value in the target.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfxrstorcomma ( FXRSTOR, )
//
// C prototype:
//  void dg_forthfxrstorcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 FXRSTOR instruction. This opcode sequence loads the floating point,
//   xmm, and mxcsr registers from memory that was saved using the FXSAVE
//   instruction.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  FXRSTOR,          // [RAX] -> STx, FPx, MXCSR,
//                             //  and probably more...
//
// Note:
//  Data size is not required for this instruction and is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfxsavecomma ( FXSAVE, )
//
// C prototype:
//  void dg_forthfxsavecomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 FXSAVE instruction. This opcode sequence copies the floating point,
//   xmm, and mxcsr registers to memory.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  FXSAVE,          //  STx, FPx, MXCSR,
//                            //  and probably more...  -> [RAX]
//
// Note:
//  Data size is not required for this instruction and is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfxtractcomma ( FXTRACT, )
//
// C prototype:
//  void dg_forthfxtractcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that replaces the value in ST0 with the value's exponent, then
//   pushes the value's mantissa to the floating point stack.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfyl2xcomma ( FYL2X, )
//
// C prototype:
//  void dg_forthfyl2xcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that replaces the value in ST1 with ST1 * log base 2 (ST0),
//   then drops the top value off the floating point stack. The value in ST0
//   can't be 0.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfyl2xp1comma ( FYL2XP1, )
//
// C prototype:
//  void dg_forthfyl2xp1comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
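//  Compiles code that replaces the value in ST1 with ST1 * log base 2 (ST0 + 1),
//   then drops the top value off the floating point stack. The value in ST0
//   must be in the range -(1 - sqrt(2)/2) to (1 - sqrt(2)/2); the result is
//   undefined for values outside this range.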
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfdupcomma ( FDUP, )
//
// C prototype:
//  void dg_forthfdupcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that pushes the contents of ST0 onto the top of the floating point
//   stack. This is the same as doing ST0 FLD,
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthhaddpdcomma ( HADDPD, )
//
// C prototype:
//  void dg_forthhaddpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 HADDPD instruction. This sequence fadds the two double precision
//   floating point values in the destination and stores the result
//   to the lower 64 bits of the destination, and also fadds the two double
//   precision floating point values in the source and stores the result to
//   the upper 64 bits of the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  HADDPD,    // XMM0[63:0] fadd XMM0[127:64] -> XMM0[63:0]
//                            // [RBX][63:0] fadd [RBX][127:64] -> XMM0[127:64]
//
//  XMM2  XMM0  HADDPD,       // XMM0[63:0] fadd XMM0[127:64] -> XMM0[63:0]
//                            // XMM2[63:0] fadd XMM2[127:64] -> XMM0[127:64]
//
//  XMM2 <-  XMM0  HADDPD, // XMM2[63:0] fadd XMM2[127:64] -> XMM2[63:0]
//                            // XMM0[63:0] fadd XMM0[127:64] -> XMM2[127:64]
//
//  XMM0  XMM8  HADDPD,       // XMM8[63:0] fadd XMM8[127:64] -> XMM8[63:0]
//                            // XMM0[63:0] fadd XMM0[127:64] -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvhaddpdcomma ( VHADDPD, )
//
// C prototype:
//  void dg_forthvhaddpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VHADDPD instruction. If the destination target is an xmm register,
//   this sequence fadds the two double precision floating point values in
//   target y and stores the result to the lower 64 bits of the destination,
//   and also fadds the two double precision floating point values in the source 
//   and stores the result to the upper 64 bits of the lower 128 bits of the
//   destination. If the destination is a ymm register, the data from the lower 
//   128 bits of the sources ends up the same way, then this instruction fadds 
//   the two double precision floating point values in the upper 128 bits of 
//   target y and puts the result into the lower 64 bits of the upper 128 bits 
//   of the destination. Then this instruction fadds the two double precision
//   values in the upper 128 bits of the source then puts the result into the
//   upper 64 bits of the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VHADDPD, 
//    // XMM1[63:0]  fadd XMM1[127:64]    -> XMM0[63:0]
//    // [RBX][63:0] fadd [RBX][127:64]   -> XMM0[127:64]
//
//  XMM2  XMM1  XMM0  VHADDPD,
//    // XMM1[63:0] fadd XMM1[127:64] -> XMM0[63:0]
//    // XMM2[63:0] fadd XMM2[127:64] -> XMM0[127:64]
//
//  XMM2 <-  XMM1  XMM0  VHADDPD, 
//    // XMM1[63:0] fadd XMM1[127:64] -> XMM2[63:0]
//    // XMM0[63:0] fadd XMM0[127:64] -> XMM2[127:64]
//
//  YMM0  YMM1  YMM8  VHADDPD,       
//    // YMM1[63:0]    fadd YMM1[127:64]  -> YMM8[63:0]
//    // YMM0[63:0]    fadd YMM0[127:64]  -> YMM8[127:64]
//    // YMM1[191:128] fadd YMM1[255:192] -> YMM8[191:128]
//    // YMM0[191:128] fadd YMM0[255:192] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthhaddpscomma ( HADDPS, )
//
// C prototype:
//  void dg_forthhaddpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 HADDPS instruction.
//   This sequence fadds the two single precision floating point values in the
//    low 64 bits of the destination and stores the result to the lower 32 bits
//    of the destination.
//   This sequence also fadds the two single precision floating point values in
//    the upper 64 bits of the destination and stores the result to the lower
//    middle 32 bits of the destination.
//   This sequence fadds the two single precision floating point values in the
//    low 64 bits of the source and stores the result to the upper middle 32
//    bits of the destination.
//   This sequence also fadds the two single precision floating point values in
//    the upper 64 bits of the source and stores the result to the upper 32
//    bits of the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  HADDPS,    // XMM0[31:0] fadd XMM0[63:32] -> XMM0[31:0]
//                            // XMM0[95:64] fadd XMM0[127:96] -> XMM0[63:32]
//                            // [RBX][31:0] fadd [RBX][63:32] -> XMM0[95:64]
//                            // [RBX][95:64] fadd [RBX][127:96] -> XMM0[127:96]
//
//  XMM2  XMM0  HADDPS,       // XMM0[31:0] fadd XMM0[63:32] -> XMM0[31:0]
//                            // XMM0[95:64] fadd XMM0[127:96] -> XMM0[63:32]
//                            // XMM2[31:0] fadd XMM2[63:32] -> XMM0[95:64]
//                            // XMM2[95:64] fadd XMM2[127:96] -> XMM0[127:96]
//
//  XMM2 <-  XMM0  HADDPS, // XMM2[31:0] fadd XMM2[63:32] -> XMM2[31:0]
//                            // XMM2[95:64] fadd XMM2[127:96] -> XMM2[63:32]
//                            // XMM0[31:0] fadd XMM0[63:32] -> XMM2[95:64]
//                            // XMM0[95:64] fadd XMM0[127:96] -> XMM2[127:96]
//
//  XMM0  XMM8  HADDPS,       // XMM8[31:0] fadd XMM8[63:32] -> XMM8[31:0]
//                            // XMM8[95:64] fadd XMM8[127:96] -> XMM8[63:32]
//                            // XMM0[31:0] fadd XMM0[63:32] -> XMM8[95:64]
//                            // XMM0[95:64] fadd XMM0[127:96] -> XMM8[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvhaddpscomma ( VHADDPS, )
//
// C prototype:
//  void dg_forthvhaddpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VHADDPS instruction.
//   This sequence fadds the two single precision floating point values in the
//    low 64 bits of target y and stores the result to the first 32 bit section
//    (lowest 32 bit section) of the destination.
//   This sequence also fadds the two single precision floating point values in
//    the upper 64 bits of the lower 128 bits of target y and stores the
//    result to the second 32 bit section of the destination.
//   This sequence fadds the two single precision floating point values in the
//    low 64 bits of the source and stores the result to the third 32 bit
//    section of the destination.
//   This sequence also fadds the two single precision floating point values in
//    the upper 64 bits of the lower 128 bits of the source and stores the
//    result to the fourth 32 bit section of the destination.
//   If the destination is a ymm register, then this instruction does the same
//    thing with the upper 128 bit halves of the source, target y, and the 
//    destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VHADDPS,    
//     // XMM1[31:0]   fadd XMM1[63:32]   -> XMM0[31:0]
//     // XMM1[95:64]  fadd XMM1[127:96]  -> XMM0[63:32]
//     // [RBX][31:0]  fadd [RBX][63:32]  -> XMM0[95:64]
//     // [RBX][95:64] fadd [RBX][127:96] -> XMM0[127:96]
//
//  XMM2  XMM1  XMM0  VHADDPS,       
//     // XMM1[31:0]  fadd XMM1[63:32]  -> XMM0[31:0]
//     // XMM1[95:64] fadd XMM1[127:96] -> XMM0[63:32]
//     // XMM2[31:0]  fadd XMM2[63:32]  -> XMM0[95:64]
//     // XMM2[95:64] fadd XMM2[127:96] -> XMM0[127:96]
//
//  XMM2 <-  XMM1  XMM0  VHADDPS, 
//     // XMM1[31:0]  fadd XMM1[63:32]  -> XMM2[31:0]
//     // XMM1[95:64] fadd XMM1[127:96] -> XMM2[63:32]
//     // XMM0[31:0]  fadd XMM0[63:32]  -> XMM2[95:64]
//     // XMM0[95:64] fadd XMM0[127:96] -> XMM2[127:96]
//
//  YMM0  YMM1  YMM8  VHADDPS,       
//     // YMM1[31:0]    fadd YMM1[63:32]   -> YMM8[31:0]
//     // YMM1[95:64]   fadd YMM1[127:96]  -> YMM8[63:32]
//     // YMM0[31:0]    fadd YMM0[63:32]   -> YMM8[95:64]
//     // YMM0[95:64]   fadd YMM0[127:96]  -> YMM8[127:96]
//     // YMM1[159:128] fadd YMM1[191:160] -> YMM8[159:128]
//     // YMM1[223:192] fadd YMM1[255:224] -> YMM8[191:160]
//     // YMM0[159:128] fadd YMM0[191:160] -> YMM8[223:192]
//     // YMM0[223:192] fadd YMM0[255:224] -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y
//  must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthhltcomma ( HLT, )
//
// C prototype:
//  void dg_forthhltcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode sequence for an x86 HLT instruction.
//  This opcode sequence halts the processor if the privilege level is 0. Only
//   an enabled interrupt, non maskable interrupt (NMI), or reset can resume
//   execution.
//
// Note:
//  I did not test this instruction :-)
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  HLT,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthhsubpdcomma ( HSUBPD, )
//
// C prototype:
//  void dg_forthhsubpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 HSUBPD instruction. This sequence fsubs the upper double precision
//   floating point value in the destination from the lower double precision
//   floating point value in the destination and stores the result to the
//   lower 64 bits of the destination, and also fsubs the upper double
//   precision floating point value in the source from the lower double
//   precision floating point value in the source and stores the result to
//   the upper 64 bits of the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  HSUBPD,    // XMM0[63:0] fsub XMM0[127:64] -> XMM0[63:0]
//                            // [RBX][63:0] fsub [RBX][127:64] -> XMM0[127:64]
//
//  XMM2  XMM0  HSUBPD,       // XMM0[63:0] fsub XMM0[127:64] -> XMM0[63:0]
//                            // XMM2[63:0] fsub XMM2[127:64] -> XMM0[127:64]
//
//  XMM2 <-  XMM0  HSUBPD, // XMM2[63:0] fsub XMM2[127:64] -> XMM2[63:0]
//                            // XMM0[63:0] fsub XMM0[127:64] -> XMM2[127:64]
//
//  XMM0  XMM8  HSUBPD,       // XMM8[63:0] fsub XMM8[127:64] -> XMM8[63:0]
//                            // XMM0[63:0] fsub XMM0[127:64] -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvhsubpdcomma ( VHSUBPD, )
//
// C prototype:
//  void dg_forthvhsubpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing
//   mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VHSUBPD instruction. If the destination target is an xmm register,
//   this sequence fsubs the two double precision floating point values in
//   target y and stores the result to the lower 64 bits of the destination,
//   and also fsubs the two double precision floating point values in the source 
//   and stores the result to the upper 64 bits of the lower 128 bits of the
//   destination. If the destination is a ymm register, the data from the lower 
//   128 bits of the sources ends up the same way, then this instruction fsubs 
//   the two double precision floating point values in the upper 128 bits of 
//   target y and puts the result into the lower 64 bits of the upper 128 bits 
//   of the destination. Then this instruction fsubs the two double precision
//   values in the upper 128 bits of the source then puts the result into the
//   upper 64 bits of the destination. The higher indexed values are subtracted
//    from the lower indexed values.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VHSUBPD, 
//    // XMM1[63:0]  fsub XMM1[127:64]    -> XMM0[63:0]
//    // [RBX][63:0] fsub [RBX][127:64]   -> XMM0[127:64]
//
//  XMM2  XMM1  XMM0  VHSUBPD,
//    // XMM1[63:0] fsub XMM1[127:64] -> XMM0[63:0]
//    // XMM2[63:0] fsub XMM2[127:64] -> XMM0[127:64]
//
//  XMM2 <-  XMM1  XMM0  VHSUBPD, 
//    // XMM1[63:0] fsub XMM1[127:64] -> XMM2[63:0]
//    // XMM0[63:0] fsub XMM0[127:64] -> XMM2[127:64]
//
//  YMM0  YMM1  YMM8  VHSUBPD,       
//    // YMM1[63:0]    fsub YMM1[127:64]  -> YMM8[63:0]
//    // YMM0[63:0]    fsub YMM0[127:64]  -> YMM8[127:64]
//    // YMM1[191:128] fsub YMM1[255:192] -> YMM8[191:128]
//    // YMM0[191:128] fsub YMM0[255:192] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthhsubpscomma ( HSUBPS, )
//
// C prototype:
//  void dg_forthhsubpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 HSUBPS instruction.
//   This sequence fsubs the lower middle single precision floating point value
//    in the destination from the lower single precision floating point value
//    in the destination and stores the result to the lower 32 bits of the
//    destination.
//   This sequence also fsubs the upper single precision floating point value
//    in the destination from the upper middle single precision floating point
//    value in the destination and stores the result to the lower
//    middle 32 bits of the destination.
//   This sequence fsubs the lower middle single precision floating point value
//    in the source from the lower single precision floating point value
//    in the source and stores the result to the upper middle 32 bits of the
//    destination.
//   This sequence also fsubs the upper single precision floating point value
//    in the source from the upper middle single precision floating point
//    value in the source and stores the result to the upper 32 bits of the
//    destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  HSUBPS,    // XMM0[31:0] fsub XMM0[63:32] -> XMM0[31:0]
//                            // XMM0[95:64] fsub XMM0[127:96] -> XMM0[63:32]
//                            // [RBX][31:0] fsub [RBX][63:32] -> XMM0[95:64]
//                            // [RBX][95:64] fsub [RBX][127:96] -> XMM0[127:96]
//
//  XMM2  XMM0  HSUBPS,       // XMM0[31:0] fsub XMM0[63:32] -> XMM0[31:0]
//                            // XMM0[95:64] fsub XMM0[127:96] -> XMM0[63:32]
//                            // XMM2[31:0] fsub XMM2[63:32] -> XMM0[95:64]
//                            // XMM2[95:64] fsub XMM2[127:96] -> XMM0[127:96]
//
//  XMM2 <-  XMM0  HSUBPS,    // XMM2[31:0] fsub XMM2[63:32] -> XMM2[31:0]
//                            // XMM2[95:64] fsub XMM2[127:96] -> XMM2[63:32]
//                            // XMM0[31:0] fsub XMM0[63:32] -> XMM2[95:64]
//                            // XMM0[95:64] fsub XMM0[127:96] -> XMM2[127:96]
//
//  XMM0  XMM8  HSUBPS,       // XMM8[31:0] fsub XMM8[63:32] -> XMM8[31:0]
//                            // XMM8[95:64] fsub XMM8[127:96] -> XMM8[63:32]
//                            // XMM0[31:0] fsub XMM0[63:32] -> XMM8[95:64]
//                            // XMM0[95:64] fsub XMM0[127:96] -> XMM8[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvhsubpscomma ( VHSUBPS, )
//
// C prototype:
//  void dg_forthvhsubpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing
//   mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VHSUBPS instruction.
//   This sequence fsubs the two single precision floating point values in the
//    low 64 bits of target y and stores the result to the first 32 bit section
//    (lowest 32 bit section) of the destination.
//   This sequence also fsubs the two single precision floating point values in
//    the upper 64 bits of the lower 128 bits of target y and stores the
//    result to the second 32 bit section of the destination.
//   This sequence fsubs the two single precision floating point values in the
//    low 64 bits of the source and stores the result to the third 32 bit
//    section of the destination.
//   This sequence also fsubs the two single precision floating point values in
//    the upper 64 bits of the lower 128 bits of the source and stores the
//    result to the fourth 32 bit section of the destination.
//   If the destination is a ymm register, then this instruction does the same
//    thing with the upper 128 bit halves of the source, target y, and the 
//    destination. The higher indexed values are subtracted from the lower
//    indexed values.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VHSUBPS,    
//     // XMM1[31:0]   fsub XMM1[63:32]   -> XMM0[31:0]
//     // XMM1[95:64]  fsub XMM1[127:96]  -> XMM0[63:32]
//     // [RBX][31:0]  fsub [RBX][63:32]  -> XMM0[95:64]
//     // [RBX][95:64] fsub [RBX][127:96] -> XMM0[127:96]
//
//  XMM2  XMM1  XMM0  VHSUBPS,       
//     // XMM1[31:0]  fsub XMM1[63:32]  -> XMM0[31:0]
//     // XMM1[95:64] fsub XMM1[127:96] -> XMM0[63:32]
//     // XMM2[31:0]  fsub XMM2[63:32]  -> XMM0[95:64]
//     // XMM2[95:64] fsub XMM2[127:96] -> XMM0[127:96]
//
//  XMM2 <-  XMM1  XMM0  VHSUBPS, 
//     // XMM1[31:0]  fsub XMM1[63:32]  -> XMM2[31:0]
//     // XMM1[95:64] fsub XMM1[127:96] -> XMM2[63:32]
//     // XMM0[31:0]  fsub XMM0[63:32]  -> XMM2[95:64]
//     // XMM0[95:64] fsub XMM0[127:96] -> XMM2[127:96]
//
//  YMM0  YMM1  YMM8  VHSUBPS,       
//     // YMM1[31:0]    fsub YMM1[63:32]   -> YMM8[31:0]
//     // YMM1[95:64]   fsub YMM1[127:96]  -> YMM8[63:32]
//     // YMM0[31:0]    fsub YMM0[63:32]   -> YMM8[95:64]
//     // YMM0[95:64]   fsub YMM0[127:96]  -> YMM8[127:96]
//     // YMM1[159:128] fsub YMM1[191:160] -> YMM8[159:128]
//     // YMM1[223:192] fsub YMM1[255:224] -> YMM8[191:160]
//     // YMM0[159:128] fsub YMM0[191:160] -> YMM8[223:192]
//     // YMM0[223:192] fsub YMM0[255:224] -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y
//  must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthidivcomma ( IDIV, )
//
// C prototype:
//  void dg_forthidivcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for the target can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   8BIT
//   16BIT
//   32BIT
//   64BIT
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1, 2, 4, or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//    an x86 IDIV instruction, which is a signed divide.
//    This opcode sequence does different actions based on the DATASIZE of the
//    instruction:
//     if size is 8BIT then AL remainder AH <- AX / target
//     if size is 16BIT then AX remainder DX <- DX:AX / target
//     if size is 32BIT then EAX remainder EDX <- EDX:EAX / target
//     if size is 64BIT then RAX remainder RDX <- RDX:RAX / target
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthjmpcomma ( JMP, )
//
// C prototype:
//  void dg_forthjmpcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//
//  The parameter list for this target can contain these addressing mode specifiers:
//
//   pcrelativeoffset EIP+N
//   currentcompilebufferoffset O
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   pcrelativeoffset             32 bit value of a pc relative offset
//   currentcompilebufferoffset   32 bit value of an offset in the current
//                                 compile buffer
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   EIP+N                        specifies a pc relative offset
//   RIP+N                        specifies a pc relative offset
//   O                            specifies an offset in the current compile buffer
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   a jump. This opcode sequence loads the program counter based on the
//   type of target:
//   If the target is RIP+N, then the value is added to the address after
//    the opcode sequence and stored in the program counter.
//   If the target is O, then the offset in the current compile buffer
//    is converted to an RIP+N offset. This offset is added to the address
//    after the opcode sequence and stored in the program counter.
//   If the target is register or memory, then the program counter is loaded
//    with the target's value.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  -5 RIP+N  JMP,  // addressofthisinstruction -> RIP
//
//  RAX  JMP,       // RAX -> RIP
//
//  RSI [R]  JMP,   // [RSI] -> RIP
//
//  bufferoffset O JMP,      // RIP+offsettobufferoffset -> RIP
//
//  bufferoffset [O]  JMP,   // [RIP+offsettobufferoffset] -> RIP
// 
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthjmpbracketssplusn16comma ( JMP[SS]+N16, )
//
// C prototype:
//  void dg_forthjmpbracketssplusn16comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for this target y can contain a constant value:
//   farsegmentaddress
//
//  The parameter list for this target x can contain these addressing mode specifiers:
//
//   immediatevalue N
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   farsegmentaddress            16 bit integer value
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The immediate
//                                 size is always the same for this instruction
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   a 'far' jump. This opcode sequence loads the program counter and code
//   segment registers based on the type of target and operating mode of the
//   cpu:
//    If the processor is in real address mode or virtual 8086 mode the program
//     counter is loaded with address from target y, and the code segment register 
//     is loaded with the value from target x
//    If the processor is in protected mode it's kind of complicated. Please
//     refer to the Intel docs for details, but in short target x points to a
//     data structure which contains the information needed to find the true
//     destination. I think the data structure might be stored in the cpu's 
//     memory but I'm not sure. I think target y is an offset that is added
//     to the base address calculated from the data structure but I'm not sure.
//    If the processor is in 64 bit mode it's similar to the protected mode
//     operation except, if the data structure is a call gate then target y's
//     value is not used otherwise target y is used as an offset from a base
//     address which comes from something called a far pointer... which I guess
//     comes from the data structure.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Note:
//  Target y is just a number, you don't need anything like N after it. Since
//   it is always a number I decided to make it so you didn't have to type in 
//   the N. Also target y in this instruction is 16 bits.
//  Target x can not be immediate in 64 bit address mode.
//  This instruction was not tested.
// 
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthjmpbracketssplusn32comma ( JMP[SS]+N32, )
//
// C prototype:
//  void dg_forthjmpbracketssplusn32comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for this target y can contain a constant value:
//   farsegmentaddress
//
//  The parameter list for this target x can contain these addressing mode specifiers:
//
//   immediatevalue N
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   farsegmentaddress              32 bit integer value
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The immediate
//                                 size is always the same for this instruction
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   a 'far' jump. This opcode sequence loads the program counter and code
//   segment registers based on the type of target and operating mode of the
//   cpu:
//    If the processor is in real address mode or virtual 8086 mode the program
//     counter is loaded with address from target y, and the code segment register 
//     is loaded with the value from target x
//    If the processor is in protected mode it's kind of complicated. Please
//     refer to the Intel docs for details, but in short target x points to a
//     data structure which contains the information needed to find the true
//     destination. I think the data structure might be stored in the cpu's 
//     memory but I'm not sure. I think target y is an offset that is added
//     to the base address calculated from the data structure but I'm not sure.
//    If the processor is in 64 bit mode it's similar to the protected mode
//     operation except, if the data structure is a call gate then target y's
//     value is not used otherwise target y is used as an offset from a base
//     address which comes from something called a far pointer... which I guess
//     comes from the data structure.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Note:
//  Target y is just a number, you don't need anything like N after it. Since
//   it is always a number I decided to make it so you didn't have to type in 
//   the N. Also target y in this instruction is 32 bits.
//  Target x can not be immediate in 64 bit address mode.
//  This instruction was not tested.
// 
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthjmpbracketssplusn64comma ( JMP[SS]+N64, )
//
// C prototype:
//  void dg_forthjmpbracketssplusn64comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for this target y can contain a constant value:
//   farsegmentaddress
//
//  The parameter list for this target x can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   farsegmentaddress            64 bit integer value
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence
//   for a 'far' jump. This opcode sequence loads the program counter and code
//   segment registers based on the type of target and operating mode of the
//   cpu:
//    If the processor is in real address mode or virtual 8086 mode the program
//     counter is loaded with address from target y, and the code segment register 
//     is loaded with the value from target x
//    If the processor is in protected mode it's kind of complicated. Please
//     refer to the Intel docs for details, but in short target x points to a
//     data structure which contains the information needed to find the true
//     destination. I think the data structure might be stored in the cpu's 
//     memory but I'm not sure. I think target y is an offset that is added
//     to the base address calculated from the data structure but I'm not sure.
//    If the processor is in 64 bit mode it's similar to the protected mode
//     operation except, if the data structure is a call gate then target y's
//     value is not used otherwise target y is used as an offset from a base
//     address which comes from something called a far pointer... which I guess
//     comes from the data structure.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Note:
//  Target y is just a number, you don't need anything like N after it. Since
//   it is always a number I decided to make it so you didn't have to type in 
//   the N. Also target y in this instruction is 64 bits.
//  Target x can not be immediate in 64 bit address mode.
//  This instruction is only supported in 64 bit address mode.
//  This instruction was not tested.
// 
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovshdupcomma ( MOVSHDUP, )
//
// C prototype:
//  void dg_forthmovshdupcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVSHDUP instruction. This sequence copies the single precision
//   floating point value in the upper 32 bits of the source to the upper
//   32 bits of the destination and also the upper middle 32 bits of the destination.
//   This sequence also copies the single precision floating point value in the
//   lower middle 32 bits of the source to the lower middle 32 bits of the
//   destination and also the lower 32 bits of the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  MOVSHDUP,    // RBX[63:32] -> XMM0[31:0]
//                              // RBX[63:32] -> XMM0[63:32]
//                              // RBX[127:96] -> XMM0[95:64]
//                              // RBX[127:96] -> XMM0[127:96]
//
//  XMM2  XMM0  MOVSHDUP,       // XMM2[63:32] -> XMM0[31:0]
//                              // XMM2[63:32] -> XMM0[63:32]
//                              // XMM2[127:96] -> XMM0[95:64]
//                              // XMM2[127:96] -> XMM0[127:96]
//
//  XMM2 <-  XMM0  MOVSHDUP, // XMM0[63:32] -> XMM2[31:0]
//                              // XMM0[63:32] -> XMM2[63:32]
//                              // XMM0[127:96] -> XMM2[95:64]
//                              // XMM0[127:96] -> XMM2[127:96]
//
//  XMM0  XMM8  MOVSHDUP,       // XMM0[63:32] -> XMM8[31:0]
//                              // XMM0[63:32] -> XMM8[63:32]
//                              // XMM0[127:96] -> XMM8[95:64]
//                              // XMM0[127:96] -> XMM8[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovshdupcomma ( VMOVSHDUP, )
//
// C prototype:
//  void dg_forthvmovshdupcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVSHDUP instruction. If the destination is an xmm register, then
//   this sequence copies the single precision floating point value in the second
//   lowest 32 bit section of the source to the first (lowest) 32 bit section
//   of the destination and also the second 32 bit section of the destination.
//   Then this opcode sequence also copies the fourth lowest 32 bit section of
//   the source to the third and fourth 32 bit sections of the destination.
//   If the destination is a ymm register, it copies the second lowest, fourth
//   lowest, sixth lowest, and eighth lowest 32 bit sections of the source to the
//   corresponding sections in the destination and also to the 32 bit section
//   before each one in the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VMOVSHDUP,   // RBX[63:32] -> XMM0[31:0]
//                              // RBX[63:32] -> XMM0[63:32]
//                              // RBX[127:96] -> XMM0[95:64]
//                              // RBX[127:96] -> XMM0[127:96]
//
//  XMM2  XMM0  VMOVSHDUP,       // XMM2[63:32] -> XMM0[31:0]
//                              // XMM2[63:32] -> XMM0[63:32]
//                              // XMM2[127:96] -> XMM0[95:64]
//                              // XMM2[127:96] -> XMM0[127:96]
//
//  XMM2 <-  XMM0  VMOVSHDUP, // XMM0[63:32] -> XMM2[31:0]
//                               // XMM0[63:32] -> XMM2[63:32]
//                               // XMM0[127:96] -> XMM2[95:64]
//                               // XMM0[127:96] -> XMM2[127:96]
//
//  YMM0  YMM8  VMOVSHDUP,       // YMM0[63:32] -> YMM8[31:0]
//                               // YMM0[63:32] -> YMM8[63:32]
//                               // YMM0[127:96] -> YMM8[95:64]
//                               // YMM0[127:96] -> YMM8[127:96]
//                               // YMM0[191:160] -> YMM8[159:128]
//                               // YMM0[191:160] -> YMM8[191:160]
//                               // YMM0[255:224] -> YMM8[223:192]
//                               // YMM0[255:224] -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovsldupcomma ( MOVSLDUP, )
//
// C prototype:
//  void dg_forthmovsldupcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVSLDUP instruction. This sequence copies the single precision
//   floating point value in the upper middle 32 bits of the source to the upper
//   32 bits of the destination and also the upper middle 32 bits of the destination.
//   This sequence also copies the single precision floating point value in the
//   lower 32 bits of the source to the lower middle 32 bits of the
//   destination and also the lower 32 bits of the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  MOVSLDUP,    // RBX[31:0] -> XMM0[31:0]
//                              // RBX[31:0] -> XMM0[63:32]
//                              // RBX[95:64] -> XMM0[95:64]
//                              // RBX[95:64] -> XMM0[127:96]
//
//  XMM2  XMM0  MOVSLDUP,       // XMM2[31:0] -> XMM0[31:0]
//                              // XMM2[31:0] -> XMM0[63:32]
//                              // XMM2[95:64] -> XMM0[95:64]
//                              // XMM2[95:64] -> XMM0[127:96]
//
//  XMM2 <-  XMM0  MOVSLDUP, // XMM0[31:0] -> XMM2[31:0]
//                              // XMM0[31:0] -> XMM2[63:32]
//                              // XMM0[95:64] -> XMM2[95:64]
//                              // XMM0[95:64] -> XMM2[127:96]
//
//  XMM0  XMM8  MOVSLDUP,       // XMM0[31:0] -> XMM8[31:0]
//                              // XMM0[31:0] -> XMM8[63:32]
//                              // XMM0[95:64] -> XMM8[95:64]
//                              // XMM0[95:64] -> XMM8[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovsldupcomma ( VMOVSLDUP, )
//
// C prototype:
//  void dg_forthvmovsldupcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVSLDUP instruction. If the destination is an xmm register, then 
//   this sequence copies the single precision floating point value in the
//   lowest 32 bit section of the source to the first (lowest) 32 bit section 
//   of the destination and also the second 32 bit section of the destination.
//   Then this opcode sequence also copies the third lowest 32 bit section of 
//   the source to the third and fourth 32 bit sections of the destination.
//   If the destination is a ymm register, it copies the first, third, fifth,
//   and seventh lowest 32 bit sections of the source to the corresponding
//   sections in the destination and also to the 32 bit section after each one
//   in the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VMOVSLDUP,    // RBX[31:0]  -> XMM0[31:0]
//                               // RBX[31:0]  -> XMM0[63:32]
//                               // RBX[95:64] -> XMM0[95:64]
//                               // RBX[95:64] -> XMM0[127:96]
//
//  XMM2  XMM0  VMOVSLDUP,       // XMM2[31:0]  -> XMM0[31:0]
//                               // XMM2[31:0]  -> XMM0[63:32]
//                               // XMM2[95:64] -> XMM0[95:64]
//                               // XMM2[95:64] -> XMM0[127:96]
//
//  XMM2 <-  XMM0  VMOVSLDUP, // XMM0[31:0]  -> XMM2[31:0]
//                               // XMM0[31:0]  -> XMM2[63:32]
//                               // XMM0[95:64] -> XMM2[95:64]
//                               // XMM0[95:64] -> XMM2[127:96]
//
//  YMM0  YMM8  VMOVSLDUP,       // YMM0[31:0]    -> YMM8[31:0]
//                               // YMM0[31:0]    -> YMM8[63:32]
//                               // YMM0[95:64]   -> YMM8[95:64]
//                               // YMM0[95:64]   -> YMM8[127:96]
//                               // YMM0[159:128] -> YMM8[159:128]
//                               // YMM0[159:128] -> YMM8[191:160]
//                               // YMM0[223:192] -> YMM8[223:192]
//                               // YMM0[223:192] -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovsscomma ( MOVSS, )
//
// C prototype:
//  void dg_forthmovsscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVSS instruction. This sequence copies the single precision
//   floating point value in the lower 32 bits of the source to the lower
//   32 bits of the destination. If the source is memory and the destination is
//   an XMM register, the value is zero extended to 128 bits when it is copied
//   to the XMM register. If both the source and destination are XMM registers,
//   the upper bits of the destination are unchanged.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  MOVSS,       // RBX[31:0] -> XMM0[31:0]
//                              // 0 -> XMM0[127:32]
//
//  XMM0  RBX [R]  MOVSS,       // XMM0[31:0] -> RBX[31:0]
//
//  XMM2  XMM0  MOVSS,          // XMM2[31:0] -> XMM0[31:0]
//
//  XMM2 <-  XMM0  MOVSS,    // XMM0[31:0] -> XMM2[31:0]
//
//  XMM0  XMM8  MOVSS,          // XMM0[31:0] -> XMM8[31:0]
//
// Note:
//  Only 1 target can be a memory target. Either target can be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovupdcomma ( MOVUPD, )
//
// C prototype:
//  void dg_forthmovupdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVUPD instruction. Intel docs say this sequence copies 2 double
//   precision floating point values from the source to the destination;
//   however, in 32 and 64 bit address mode, this is probably just a 128 bit
//   move and any 128 bit value will work.
//   Memory targets do NOT need to be aligned to a boundary for this instruction.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  MOVUPD,     // RBX[127:0] -> XMM0[127:0]
//
//  XMM0  RBX [R]  MOVUPD,     // XMM0[127:0] -> RBX[127:0]
//
//  XMM2  XMM0  MOVUPD,        // XMM2[127:0] -> XMM0[127:0]
//
//  XMM2 <-  XMM0  MOVUPD,  // XMM0[127:0] -> XMM2[127:0]
//
//  XMM0  XMM8  MOVUPD,        // XMM0[127:0] -> XMM8[127:0]
//
// Note:
//  Only 1 target can be a memory target. Either target can be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovupscomma ( MOVUPS, )
//
// C prototype:
//  void dg_forthmovupscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVUPS instruction. Intel docs say this sequence copies 4 single
//   precision floating point values from the source to the destination;
//   however, in 32 and 64 bit address mode, this is probably just a 128 bit
//   move and any 128 bit value will work.
//   Memory targets do NOT need to be aligned to a boundary for this instruction.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  MOVUPS,     // RBX[127:0] -> XMM0[127:0]
//
//  XMM0  RBX [R]  MOVUPS,     // XMM0[127:0] -> RBX[127:0]
//
//  XMM2  XMM0  MOVUPS,        // XMM2[127:0] -> XMM0[127:0]
//
//  XMM2 <-  XMM0  MOVUPS,  // XMM0[127:0] -> XMM2[127:0]
//
//  XMM0  XMM8  MOVUPS,        // XMM0[127:0] -> XMM8[127:0]
//
// Note:
//  Only 1 target can be a memory target. Either target can be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmulpdcomma ( MULPD, )
//
// C prototype:
//  void dg_forthmulpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MULPD instruction. This sequence multiplies each double precision
//   floating point value in the destination by the corresponding double
//   precision floating point value in the source and stores the results
//   to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  MULPD,     // XMM0[63:0] * [RBX][63:0] -> XMM0[63:0]
//                            // XMM0[127:64] * [RBX][127:64] -> XMM0[127:64]
//
//  XMM2  XMM0  MULPD,        // XMM0[63:0] * XMM2[63:0] -> XMM0[63:0]
//                            // XMM0[127:64] * XMM2[127:64] -> XMM0[127:64]
//
//  XMM2 <-  XMM0  MULPD,  // XMM2[63:0] * XMM0[63:0] -> XMM2[63:0]
//                            // XMM2[127:64] * XMM0[127:64] -> XMM2[127:64]
//
//  XMM0  XMM8  MULPD,        // XMM8[63:0] * XMM0[63:0] -> XMM8[63:0]
//                            // XMM8[127:64] * XMM0[127:64] -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmulpdcomma ( VMULPD, )
//
// C prototype:
//  void dg_forthvmulpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VMULPD instruction. This sequence multiplies each double precision
//   floating point value in target y by the corresponding double
//   precision floating point value in the source and stores the results
//   to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VMULPD,  
//                            // XMM1[63:0] * [RBX][63:0] -> XMM0[63:0]
//                            // XMM1[127:64] * [RBX][127:64] -> XMM0[127:64]
//
//  XMM2  XMM1  XMM0  VMULPD, // XMM1[63:0] * XMM2[63:0] -> XMM0[63:0]
//                            // XMM1[127:64] * XMM2[127:64] -> XMM0[127:64]
//
//  XMM2 <-  XMM1  XMM0  VMULPD,  
//                            // XMM1[63:0] * XMM0[63:0] -> XMM2[63:0]
//                            // XMM1[127:64] * XMM0[127:64] -> XMM2[127:64]
//
//  YMM0  YMM1  YMM8  VMULPD, // YMM1[63:0] * YMM0[63:0] -> YMM8[63:0]
//                            // YMM1[127:64] * YMM0[127:64] -> YMM8[127:64]
//                            // YMM1[191:128] * YMM0[191:128] -> YMM8[191:128]
//                            // YMM1[255:192] * YMM0[255:192] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmulpscomma ( MULPS, )
//
// C prototype:
//  void dg_forthmulpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MULPS instruction. This sequence multiplies each single precision
//   floating point value in the destination by the corresponding single
//   precision floating point value in the source and stores the results
//   to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  MULPS,     // XMM0[31:0] * [RBX][31:0] -> XMM0[31:0]
//                            // XMM0[63:32] * [RBX][63:32] -> XMM0[63:32]
//                            // XMM0[95:64] * [RBX][95:64] -> XMM0[95:64]
//                            // XMM0[127:96] * [RBX][127:96] -> XMM0[127:96]
//
//  XMM2  XMM0  MULPS,        // XMM0[31:0] * XMM2[31:0] -> XMM0[31:0]
//                            // XMM0[63:32] * XMM2[63:32] -> XMM0[63:32]
//                            // XMM0[95:64] * XMM2[95:64] -> XMM0[95:64]
//                            // XMM0[127:96] * XMM2[127:96] -> XMM0[127:96]
//
//  XMM2 <-  XMM0  MULPS,  // XMM2[31:0] * XMM0[31:0] -> XMM2[31:0]
//                            // XMM2[63:32] * XMM0[63:32] -> XMM2[63:32]
//                            // XMM2[95:64] * XMM0[95:64] -> XMM2[95:64]
//                            // XMM2[127:96] * XMM0[127:96] -> XMM2[127:96]
//
//  XMM0  XMM8  MULPS,        // XMM8[31:0] * XMM0[31:0] -> XMM8[31:0]
//                            // XMM8[63:32] * XMM0[63:32] -> XMM8[63:32]
//                            // XMM8[95:64] * XMM0[95:64] -> XMM8[95:64]
//                            // XMM8[127:96] * XMM0[127:96] -> XMM8[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmulpscomma ( VMULPS, )
//
// C prototype:
//  void dg_forthvmulpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VMULPS instruction. This sequence multiplies each single precision
//   floating point value in target y by the corresponding single
//   precision floating point value in the source and stores the results
//   to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VMULPS,     
//                            // XMM1[31:0]   * [RBX][31:0]   -> XMM0[31:0]
//                            // XMM1[63:32]  * [RBX][63:32]  -> XMM0[63:32]
//                            // XMM1[95:64]  * [RBX][95:64]  -> XMM0[95:64]
//                            // XMM1[127:96] * [RBX][127:96] -> XMM0[127:96]
//
//  XMM2  XMM1  XMM0  VMULPS, // XMM1[31:0]   * XMM2[31:0]   -> XMM0[31:0]
//                            // XMM1[63:32]  * XMM2[63:32]  -> XMM0[63:32]
//                            // XMM1[95:64]  * XMM2[95:64]  -> XMM0[95:64]
//                            // XMM1[127:96] * XMM2[127:96] -> XMM0[127:96]
//
//  XMM2 <-  XMM1  XMM0  VMULPS,  
//                            // XMM1[31:0]   * XMM0[31:0]   -> XMM2[31:0]
//                            // XMM1[63:32]  * XMM0[63:32]  -> XMM2[63:32]
//                            // XMM1[95:64]  * XMM0[95:64]  -> XMM2[95:64]
//                            // XMM1[127:96] * XMM0[127:96] -> XMM2[127:96]
//
//  YMM0  YMM1  YMM8  VMULPS, // YMM1[31:0]    * YMM0[31:0]    -> YMM8[31:0]
//                            // YMM1[63:32]   * YMM0[63:32]   -> YMM8[63:32]
//                            // YMM1[95:64]   * YMM0[95:64]   -> YMM8[95:64]
//                            // YMM1[127:96]  * YMM0[127:96]  -> YMM8[127:96]
//                            // YMM1[159:128] * YMM0[159:128] -> YMM8[159:128]
//                            // YMM1[191:160] * YMM0[191:160] -> YMM8[191:160]
//                            // YMM1[223:192] * YMM0[223:192] -> YMM8[223:192]
//                            // YMM1[255:224] * YMM0[255:224] -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmulsdcomma ( MULSD, )
//
// C prototype:
//  void dg_forthmulsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MULSD instruction. This sequence multiplies the double precision
//   floating point value in the lower 64 bits of the destination by the double
//   precision floating point value in the lower 64 bits of the source and
//   stores the result to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  MULSD,     // XMM0[63:0] * [RBX][63:0] -> XMM0[63:0]
//
//  XMM2  XMM0  MULSD,        // XMM0[63:0] * XMM2[63:0] -> XMM0[63:0]
//
//  XMM2 <-  XMM0  MULSD,  // XMM2[63:0] * XMM0[63:0] -> XMM2[63:0]
//
//  XMM0  XMM8  MULSD,        // XMM8[63:0] * XMM0[63:0] -> XMM8[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmulsdcomma ( VMULSD, )
//
// C prototype:
//  void dg_forthvmulsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VMULSD instruction. This sequence multiplies the double precision
//   floating point value in the lower 64 bits of target y by the double
//   precision floating point value in the lower 64 bits of the source and
//   stores the result to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VMULSD,     // XMM1[63:0] * [RBX][63:0] -> XMM0[63:0]
//
//  XMM2  XMM1  XMM0  VMULSD,        // XMM1[63:0] * XMM2[63:0] -> XMM0[63:0]
//
//  XMM2 <-  XMM1  XMM0  VMULSD,  // XMM1[63:0] * XMM0[63:0] -> XMM2[63:0]
//
//  XMM0  XMM1  XMM8  VMULSD,        // XMM1[63:0] * XMM0[63:0] -> XMM8[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmulsscomma ( MULSS, )
//
// C prototype:
//  void dg_forthmulsscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MULSS instruction. This sequence multiplies the single precision
//   floating point value in the lower 32 bits of the destination by the single
//   precision floating point value in the lower 32 bits of the source and
//   stores the result to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  MULSS,     // XMM0[31:0] * [RBX][31:0] -> XMM0[31:0]
//
//  XMM2  XMM0  MULSS,        // XMM0[31:0] * XMM2[31:0] -> XMM0[31:0]
//
//  XMM2 <-  XMM0  MULSS,  // XMM2[31:0] * XMM0[31:0] -> XMM2[31:0]
//
//  XMM0  XMM8  MULSS,        // XMM8[31:0] * XMM0[31:0] -> XMM8[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmulsscomma ( VMULSS, )
//
// C prototype:
//  void dg_forthvmulsscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VMULSS instruction. This sequence multiplies the single precision
//   floating point value in the lower 32 bits of target y by the single
//   precision floating point value in the lower 32 bits of the source and
//   stores the result to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VMULSS,     // XMM1[31:0] * [RBX][31:0] -> XMM0[31:0]
//
//  XMM2  XMM1  XMM0  VMULSS,        // XMM1[31:0] * XMM2[31:0] -> XMM0[31:0]
//
//  XMM2 <-  XMM1  XMM0  VMULSS,  // XMM1[31:0] * XMM0[31:0] -> XMM2[31:0]
//
//  XMM0  XMM1  XMM8  VMULSS,        // XMM1[31:0] * XMM0[31:0] -> XMM8[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthleacomma ( LEA, )
//
// C prototype:
//  void dg_forthleacomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   currentcompilebufferoffset [O]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   8BIT
//   16BIT
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1, 2, 4, or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 LEA instruction. The effective address of the memory target is
//   loaded into the register target. For example, if the memory target is
//   [R+N], then R+N is loaded into the register target.
//
// Note:
//  If you specify a register and a memory target, it doesn't matter which one
//   is target x or target y or what direction is used. The memory target's
//   effective address is calculated and the register target receives the
//   result.
//  If you specify two register targets, target x is the effective address and
//   target y is the destination; however, this will generate an invalid opcode
//   exception.
//  If you specify two register targets along with <- then target y is the
//   effective address and target x is the destination;  however this will
//   generate an invalid opcode exception.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 32 bit address mode example:
//  EBX 8 [R+N]  EAX  LEA,  // essentially puts EBX + 8 into EAX
//
// 64 bit address mode example:
//  R9 [R]       RCX  LEA,  // essentially R9 RCX -> MOV,
//  R9 3 [R+N]   RDX  LEA,  // essentially puts R9 + 3 into RDX
//  How to get the address of some data in your program relative to the current
//   instruction:
//    RIP displacement [R+N]  RAX  LEA,
//  Another way to get the address of some data in your program relative to
//   the current instruction: (this way you don't have to manually calculate
//   the displacement)
//    currentcompilebufferoffset [O]  RAX  LEA,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthleavecomma ( LEAVE, )
//
// C prototype:
//  void dg_forthleavecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode string for the x86 LEAVE instruction.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  LEAVE,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthlfencecomma ( LFENCE, )
//
// C prototype:
//  void dg_forthlfencecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that makes sure all previous instructions complete and then
//   makes sure this instruction completes before any instruction after this
//   one starts.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  LFENCE,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthlgdtcomma ( LGDT, )
//
// C prototype:
//  void dg_forthlgdtcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 LGDT instruction. In 32 bit mode, this opcode sequence copies a
//   48 bit value from the source to the GDTR register. In 64 bit mode, this
//   opcode sequence copies an 80 bit value to the GDTR register. The value
//   copied is an address plus a 16 bit length in bytes.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  LGDT,          //  [RAX] -> GDTR
//
// Note:
//  Data size is not required for this instruction and is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthlidtcomma ( LIDT, )
//
// C prototype:
//  void dg_forthlidtcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 LIDT instruction. In 32 bit mode, this opcode sequence copies a
//   48 bit value from the source to the IDTR register. In 64 bit mode, this
//   opcode sequence copies an 80 bit value to the IDTR register. The value
//   copied is an address plus a 16 bit length in bytes.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  LIDT,          //  [RAX] -> IDTR
//
// Note:
//  Data size is not required for this instruction and is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthlldtcomma ( LLDT, )
//
// C prototype:
//  void dg_forthlldtcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 LLDT instruction. This opcode loads the 16 bit value from the
//   source into the LDTR register. The LDTR register is the local
//   descriptor table register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  LLDT,             // [RAX] -> LDTR
//
// Note:
//  Data size is not required for this instruction and is ignored except
//   for specifying a data size of 64 bits. If you specify 64BIT you
//   will get the REX.W prefix. Intel docs don't say what will happen if
//   there is a REX.W prefix but it will probably be ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthlmswcomma ( LMSW, )
//
// C prototype:
//  void dg_forthlmswcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 LMSW instruction. This opcode loads the 16 bit value from the
//   source into the machine status word bits of register CR0.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  LMSW,             // [RAX] -> CR0[15:0]
//
// Note:
//  Data size is not required for this instruction and is ignored except
//   for specifying a data size of 64 bits. If you specify 64BIT you
//   will get the REX.W prefix. Intel docs don't say what will happen if
//   there is a REX.W prefix but it will probably be ignored.
//
//  Intel docs recommend against using this instruction on newer
//   processors and prefer that you use the MOVCR instruction instead.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthlockcomma ( LOCK, )
//
// C prototype:
//  void dg_forthlockcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode byte for an x86 LOCK instruction prefix.
//  You probably won't need to use this on the newer processors.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  LOCK,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthlodsbcomma ( LODSB, )
//
// C prototype:
//  void dg_forthlodsbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode byte for the x86 LODSB instruction.
//  In 32 bit addressing mode:
//   Loads the 8 bit value in [ESI] to AL. ECX is decremented only when
//    the instruction is used with a REP prefix.
//   ESI is adjusted according to the direction flag,
//    clear increments and set decrements.
//  In 64 bit addressing mode:
//   Loads the 8 bit value in [RSI] to AL. RCX is decremented only when
//    the instruction is used with a REP prefix.
//   RSI is adjusted according to the direction flag,
//    clear increments and set decrements.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Note:
//   Some operating systems require the direction bit to
//   be left to forward (clear) when passing control to operating system
//   routines.
//   I usually push the flag register when entering and pop it when exiting
//   subroutines to cover this requirement.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthlodsdcomma ( LODSD, )
//
// C prototype:
//  void dg_forthlodsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode byte for the x86 LODSD instruction.
//  In 32 bit addressing mode:
//   Loads the 32 bit value in [ESI] to EAX. ECX is decremented only when
//    the instruction is used with a REP prefix.
//   ESI is adjusted according to the direction flag,
//    clear adds 4 and set subtracts 4.
//  In 64 bit addressing mode:
//   Loads the 32 bit value in [RSI] to EAX. RCX is decremented only when
//    the instruction is used with a REP prefix.
//   RSI is adjusted according to the direction flag,
//    clear adds 4 and set subtracts 4.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Note:
//   Some operating systems require the direction bit to
//   be left to forward (clear) when passing control to operating system
//   routines.
//   I usually push the flag register when entering and pop it when exiting
//   subroutines to cover this requirement.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthlodsqcomma ( LODSQ, )
//
// C prototype:
//  void dg_forthlodsqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode byte for the x86 LODSQ instruction.
//  Not supported in 32 bit addressing mode.
//  In 64 bit addressing mode:
//   Loads the 64 bit value in [RSI] to RAX. RCX is decremented only when
//    the instruction is used with a REP prefix.
//   RSI is adjusted according to the direction flag,
//    clear adds 8 and set subtracts 8.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Note:
//   Some operating systems require the direction bit to
//   be left to forward (clear) when passing control to operating system
//   routines.
//   I usually push the flag register when entering and pop it when exiting
//   subroutines to cover this requirement.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthlodswcomma ( LODSW, )
//
// C prototype:
//  void dg_forthlodswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode byte for the x86 LODSW instruction.
//  In 32 bit addressing mode:
//   Copies the 16 bit value from [ESI] to AX. ECX is decremented only when
//    the instruction is used with a REP prefix.
//   ESI is adjusted according to the direction flag,
//    clear adds 2 and set subtracts 2.
//  In 64 bit addressing mode:
//   Copies the 16 bit value from [RSI] to AX. RCX is decremented only when
//    the instruction is used with a REP prefix.
//   RSI is adjusted according to the direction flag,
//    clear adds 2 and set subtracts 2.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Note:
//   Some operating systems require the direction bit to
//   be left to forward (clear) when passing control to operating system
//   routines.
//   I usually push the flag register when entering and pop it when exiting
//   subroutines to cover this requirement.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthlslcomma ( LSL, )
//
// C prototype:
//  void dg_forthlslcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how targetxparameterlist is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AX BX CX DX BP SI DI SP
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 LSL instruction. This opcode sequence loads the unscrambled
//   segment limit from the segment descriptor specified by the source to the
//   destination. The zero flag is set if the segment limit is loaded
//   successfully; otherwise it is cleared.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]   CX   LSL,
//  CX        AX   LSL,
//
// Note:
//  Data size is ignored. Docs show a rex.w version of this instruction but
//   it looks like the result will be the same and the upper 32 bits of
//   the register get cleared with or without rex.w... this is because if
//   you write to a 32 bit register in 64 bit mode the upper 32 bits are
//   cleared anyways. The only question is what happens to bits 16 to 31...
//   In any case I didn't include support for the rex.w version. If you
//   want rex.w, compile a 0x48 just before using this compiling word.
//  Reverse is not supported.
//  The destination must be a register target.
//  The source can be a memory or register target.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthltrcomma ( LTR, )
//
// C prototype:
//  void dg_forthltrcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 LTR instruction. This opcode loads the 16 bit value from the
//   source into the task register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  LTR,              // [RAX][15:0] -> TR
//
// Note:
//  Data size is not required for this instruction and is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthlzcntcomma ( LZCNT, )
//
// C prototype:
//  void dg_forthlzcntcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  If you want to set the data size for memory targets, you can use these:
//   (This is optional for this instruction. The compiler can figure it out from
//   the size of the destination register.)
//   16BIT
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of a memory target in bytes,
//                                 can be  2, 4, or 8
//   DATASIZE                     sets the data size of a memory target
//                                 This is pushed after a memory target
//                                 parameters and can not come in the middle
//                                 of a memory target.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 LZCNT instruction. This opcode sequence counts the leading zeroes
//   in the source and puts the count into the destination.
//   The source can be a register or a value in memory. The destination must
//   be a register. The sizes of the source and destination must be the same.
//   The size can be 2, 4, or 8 bytes.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//   RAX [R]  CX  LZCNT,   // countofleadingzeros( [RAX][15:0] ) -> CX
//
//   RAX [R]  ECX  LZCNT,  // countofleadingzeros( [RAX][31:0] ) -> ECX
//
//   RAX [R]  RCX  LZCNT,  // countofleadingzeros( [RAX][63:0] ) -> RCX
//
//   AX  CX  LZCNT,        // countofleadingzeros( AX ) -> CX
//
//   EAX  ECX  LZCNT,      // countofleadingzeros( EAX ) -> ECX
//
//   RAX  RCX  LZCNT,      // countofleadingzeros( RAX ) -> RCX
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmaskmovdqucomma ( MASKMOVDQU, )
//
// C prototype:
//  void dg_forthmaskmovdqucomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array
//                                 where the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( sourceparameterlist maskparameterlist -- )
//
// Data stack in:
//
//  sourceparameterlist
//  maskparameterlist
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MASKMOVDQU instruction. This opcode sequence copies selected
//   bytes from the source xmm register to a 128 bit memory location at
//   the address in EDI/RDI. The bytes copied are chosen using the value in
//   the mask xmm register. If bit 7 of a byte in the mask
//   is set, then the corresponding byte in the source is copied. Otherwise
//   the byte in the destination memory at the address in EDI/RDI is left
//   unchanged.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit example:
//  XMM0 XMM1  MASKMOVDQU, // if XMM1[7]   = 1 then XMM0[7:0]   -> [RDI][7:0]
//                         // if XMM1[15]  = 1 then XMM0[15:8]  -> [RDI][15:8]
//                         // if XMM1[23]  = 1 then XMM0[23:16] -> [RDI][23:16]
//                         // if XMM1[31]  = 1 then XMM0[31:24] -> [RDI][31:24]
//                         // if XMM1[39]  = 1 then XMM0[39:32] -> [RDI][39:32]
//                         // if XMM1[47]  = 1 then XMM0[47:40] -> [RDI][47:40]
//                         // if XMM1[55]  = 1 then XMM0[55:48] -> [RDI][55:48]
//                         // if XMM1[63]  = 1 then XMM0[63:56] -> [RDI][63:56]
//                         // if XMM1[71]  = 1 then XMM0[71:64] -> [RDI][71:64]
//                         // if XMM1[79]  = 1 then XMM0[79:72] -> [RDI][79:72]
//                         // if XMM1[87]  = 1 then XMM0[87:80] -> [RDI][87:80]
//                         // if XMM1[95]  = 1 then XMM0[95:88] -> [RDI][95:88]
//                         // if XMM1[103] = 1 then XMM0[103:96] -> [RDI][103:96]
//                         // if XMM1[111] = 1 then XMM0[111:104] -> [RDI][111:104]
//                         // if XMM1[119] = 1 then XMM0[119:112] -> [RDI][119:112]
//                         // if XMM1[127] = 1 then XMM0[127:120] -> [RDI][127:120]
//
// Note:
//  There is something in the Intel docs about caching in a multiprocessor
//   environment. If multiple processors are going to be using the destination
//   memory, you might want to read it.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmaskmovdqucomma ( VMASKMOVDQU, )
//
// C prototype:
//  void dg_forthvmaskmovdqucomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array
//                                 where the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( maskparameterlist sourceparameterlist -- )
//
// Data stack in:
//
//  sourceparameterlist
//  maskparameterlist
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VMASKMOVDQU instruction. This opcode sequence copies selected
//   bytes from the source xmm register to a 128 bit memory location at
//   the address in EDI/RDI. The bytes copied are chosen using the value in
//   the mask xmm register. If bit 7 of a byte in the mask
//   is set, then the corresponding byte in the source is copied. Otherwise
//   the byte in the destination memory at the address in EDI/RDI is left
//   unchanged.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit example:
//  XMM0 XMM1  VMASKMOVDQU, // if XMM1[7]   = 1 then XMM0[7:0]   -> [RDI][7:0]
//                          // if XMM1[15]  = 1 then XMM0[15:8]  -> [RDI][15:8]
//                          // if XMM1[23]  = 1 then XMM0[23:16] -> [RDI][23:16]
//                          // if XMM1[31]  = 1 then XMM0[31:24] -> [RDI][31:24]
//                          // if XMM1[39]  = 1 then XMM0[39:32] -> [RDI][39:32]
//                          // if XMM1[47]  = 1 then XMM0[47:40] -> [RDI][47:40]
//                          // if XMM1[55]  = 1 then XMM0[55:48] -> [RDI][55:48]
//                          // if XMM1[63]  = 1 then XMM0[63:56] -> [RDI][63:56]
//                          // if XMM1[71]  = 1 then XMM0[71:64] -> [RDI][71:64]
//                          // if XMM1[79]  = 1 then XMM0[79:72] -> [RDI][79:72]
//                          // if XMM1[87]  = 1 then XMM0[87:80] -> [RDI][87:80]
//                          // if XMM1[95]  = 1 then XMM0[95:88] -> [RDI][95:88]
//                          // if XMM1[103] = 1 then XMM0[103:96] -> [RDI][103:96]
//                          // if XMM1[111] = 1 then XMM0[111:104] -> [RDI][111:104]
//                          // if XMM1[119] = 1 then XMM0[119:112] -> [RDI][119:112]
//                          // if XMM1[127] = 1 then XMM0[127:120] -> [RDI][127:120]
//
// Note:
//  There is something in the Intel docs about caching in a multiprocessor
//   environment. If multiple processors are going to be using the destination
//   memory, you might want to read it.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmaskmovqcomma ( MASKMOVQ, )
//
// C prototype:
//  void dg_forthmaskmovqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array
//                                 where the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( sourceparameterlist maskparameterlist -- )
//
// Data stack in:
//
//  sourceparameterlist
//  maskparameterlist
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//  floatingpointregister         one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MASKMOVQ instruction. This opcode sequence copies selected bytes
//   from the source floating point register to a 64 bit memory location at
//   the address in EDI/RDI. The bytes copied are chosen using the value in
//   the mask floating point register. If the bit 7 of the byte in the mask
//   is set, then the corresponding byte in the source is copied. Otherwise
//   the byte in the destination memory at the address in EDI/RDI is left
//   unchanged.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit example:
//  ST0  ST1  MASKMOVQ,  // if ST1[7]  = 1 then ST0[7:0]   -> [RDI][7:0]
//                       // if ST1[15] = 1 then ST0[15:8]  -> [RDI][15:8]
//                       // if ST1[23] = 1 then ST0[23:16] -> [RDI][23:16]
//                       // if ST1[31] = 1 then ST0[31:24] -> [RDI][31:24]
//                       // if ST1[39] = 1 then ST0[39:32] -> [RDI][39:32]
//                       // if ST1[47] = 1 then ST0[47:40] -> [RDI][47:40]
//                       // if ST1[55] = 1 then ST0[55:48] -> [RDI][55:48]
//                       // if ST1[63] = 1 then ST0[63:56] -> [RDI][63:56]
//
// Note:
//  There is something in the Intel docs about caching in a multiprocessor
//   environment. If multiple processors are going to be using the destination
//   memory, you might want to read it.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmaxpdcomma ( MAXPD, )
//
// C prototype:
//  void dg_forthmaxpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MAXPD instruction. This sequence compares each double precision
//   floating point value in the destination with the corresponding double
//   precision floating point value in the source and stores the maximum
//   to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  MAXPD,     // if XMM0[63:0] <= [RBX][63:0] then
//                            //  [RBX][63:0] -> XMM0[63:0]
//                            // if XMM0[127:64] <= [RBX][127:64] then
//                            //  [RBX][127:64] -> XMM0[127:64]
//
//  XMM2  XMM0  MAXPD,        // if XMM0[63:0] <= XMM2[63:0] then
//                            //  XMM2[63:0] -> XMM0[63:0]
//                            // if XMM0[127:64] <= XMM2[127:64] then
//                            //  XMM2[127:64] -> XMM0[127:64]
//
//  XMM2 <-  XMM0  MAXPD,  // if XMM2[63:0] <= XMM0[63:0] then
//                            //  XMM0[63:0] -> XMM2[63:0]
//                            // if XMM2[127:64] <= XMM0[127:64] then
//                            //  XMM0[127:64] -> XMM2[127:64]
//
//  XMM0  XMM8  MAXPD,        // if XMM8[63:0] <= XMM0[63:0] then
//                            //  XMM0[63:0] -> XMM8[63:0]
//                            // if XMM8[127:64] <= XMM0[127:64] then
//                            //  XMM0[127:64] -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmaxpdcomma ( VMAXPD, )
//
// C prototype:
//  void dg_forthvmaxpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing
//   mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VMAXPD instruction. This sequence compares each double precision
//   floating point value in target y with the corresponding double
//   precision floating point value in the source and stores the maximum
//   to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VMAXPD,  // if XMM1[63:0] <= [RBX][63:0] then
//                                //  [RBX][63:0] -> XMM0[63:0]
//                                // else
//                                //  XMM1[63:0] -> XMM0[63:0]
//
//                                // if XMM1[127:64] <= [RBX][127:64] then
//                                //  [RBX][127:64] -> XMM0[127:64]
//                                // else
//                                //  XMM1[127:64] -> XMM0[127:64]
//
//  XMM2  XMM1  XMM0  VMAXPD, // if XMM1[63:0] <= XMM2[63:0] then
//                            //  XMM2[63:0] -> XMM0[63:0]
//                            // else
//                            //  XMM1[63:0] -> XMM0[63:0]
//
//                            // if XMM1[127:64] <= XMM2[127:64] then
//                            //  XMM2[127:64] -> XMM0[127:64]
//                            // else
//                            //  XMM1[127:64] -> XMM0[127:64]
//
//  XMM2 <-  XMM1  XMM0  VMAXPD,
//                            // if XMM1[63:0] <= XMM0[63:0] then
//                            //  XMM0[63:0] -> XMM2[63:0]
//                            // else
//                            //  XMM1[63:0] -> XMM2[63:0]
//
//                            // if XMM1[127:64] <= XMM0[127:64] then
//                            //  XMM0[127:64] -> XMM2[127:64]
//                            // else
//                            //  XMM1[127:64] -> XMM2[127:64]
//
//  YMM0  YMM1  YMM8  VMAXPD, // if YMM1[63:0] <= YMM0[63:0] then
//                            //  YMM0[63:0] -> YMM8[63:0]
//                            // else
//                            //  YMM1[63:0] -> YMM8[63:0]
//
//                            // if YMM1[127:64] <= YMM0[127:64] then
//                            //  YMM0[127:64] -> YMM8[127:64]
//                            // else
//                            //  YMM1[127:64] -> YMM8[127:64]
//
//                            // if YMM1[191:128] <= YMM0[191:128] then
//                            //  YMM0[191:128] -> YMM8[191:128]
//                            // else
//                            //  YMM1[191:128] -> YMM8[191:128]
//
//                            // if YMM1[255:192] <= YMM0[255:192] then
//                            //  YMM0[255:192] -> YMM8[255:192]
//                            // else
//                            //  YMM1[255:192] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmaxpscomma ( MAXPS, )
//
// C prototype:
//  void dg_forthmaxpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MAXPS instruction. This sequence compares each single precision
//   floating point value in the destination with the corresponding single
//   precision floating point value in the source and stores the maximum
//   to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  MAXPS,     // if XMM0[31:0] <= [RBX][31:0] then
//                            //  [RBX][31:0] -> XMM0[31:0]
//                            // if XMM0[63:32] <= [RBX][63:32] then
//                            //  [RBX][63:32] -> XMM0[63:32]
//                            // if XMM0[95:64] <= [RBX][95:64] then
//                            //  [RBX][95:64] -> XMM0[95:64]
//                            // if XMM0[127:96] <= [RBX][127:96] then
//                            //  [RBX][127:96] -> XMM0[127:96]
//
//  XMM2  XMM0  MAXPS,        // if XMM0[31:0] <= XMM2[31:0] then
//                            //  XMM2[31:0] -> XMM0[31:0]
//                            // if XMM0[63:32] <= XMM2[63:32] then
//                            //  XMM2[63:32] -> XMM0[63:32]
//                            // if XMM0[95:64] <= XMM2[95:64] then
//                            //  XMM2[95:64] -> XMM0[95:64]
//                            // if XMM0[127:96] <= XMM2[127:96] then
//                            //  XMM2[127:96] -> XMM0[127:96]
//
//  XMM2 <-  XMM0  MAXPS,  // if XMM2[31:0] <= XMM0[31:0] then
//                            //  XMM0[31:0] -> XMM2[31:0]
//                            // if XMM2[63:32] <= XMM0[63:32] then
//                            //  XMM0[63:32] -> XMM2[63:32]
//                            // if XMM2[95:64] <= XMM0[95:64] then
//                            //  XMM0[95:64] -> XMM2[95:64]
//                            // if XMM2[127:96] <= XMM0[127:96] then
//                            //  XMM0[127:96] -> XMM2[127:96]
//
//  XMM0  XMM8  MAXPS,        // if XMM8[31:0] <= XMM0[31:0] then
//                            //  XMM0[31:0] -> XMM8[31:0]
//                            // if XMM8[63:32] <= XMM0[63:32] then
//                            //  XMM0[63:32] -> XMM8[63:32]
//                            // if XMM8[95:64] <= XMM0[95:64] then
//                            //  XMM0[95:64] -> XMM8[95:64]
//                            // if XMM8[127:96] <= XMM0[127:96] then
//                            //  XMM0[127:96] -> XMM8[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmaxpscomma ( VMAXPS, )
//
// C prototype:
//  void dg_forthvmaxpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VMAXPS instruction. This sequence compares each single precision
//   floating point value in target y with the corresponding single
//   precision floating point value in the source and stores the maximum
//   to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VMAXPS,  // if XMM1[31:0] <= [RBX][31:0] then
//                                //  [RBX][31:0] -> XMM0[31:0]
//                                // else
//                                //  XMM1[31:0] -> XMM0[31:0]
//
//                                // if XMM1[63:32] <= [RBX][63:32] then
//                                //  [RBX][63:32] -> XMM0[63:32]
//                                // else
//                                //  XMM1[63:32] -> XMM0[63:32]
//
//                                // if XMM1[95:64] <= [RBX][95:64] then
//                                //  [RBX][95:64] -> XMM0[95:64]
//                                // else
//                                //  XMM1[95:64] -> XMM0[95:64]
//
//                                // if XMM1[127:96] <= [RBX][127:96] then
//                                //  [RBX][127:96] -> XMM0[127:96]
//                                // else
//                                //  XMM1[127:96] -> XMM0[127:96]
//
//  XMM2  XMM1  XMM0  VMAXPS,     // if XMM1[31:0] <= XMM2[31:0] then
//                                //  XMM2[31:0] -> XMM0[31:0]
//                                // else
//                                //  XMM1[31:0] -> XMM0[31:0]
//
//                                // if XMM1[63:32] <= XMM2[63:32] then
//                                //  XMM2[63:32] -> XMM0[63:32]
//                                // else
//                                //  XMM1[63:32] -> XMM0[63:32]
//
//                                // if XMM1[95:64] <= XMM2[95:64] then
//                                //  XMM2[95:64] -> XMM0[95:64]
//                                // else
//                                //  XMM1[95:64] -> XMM0[95:64]
//
//                                // if XMM1[127:96] <= XMM2[127:96] then
//                                //  XMM2[127:96] -> XMM0[127:96]
//                                // else
//                                //  XMM1[127:96] -> XMM0[127:96]
//
//  XMM2 <-  XMM1  XMM0  VMAXPS, // if XMM1[31:0] <= XMM0[31:0] then
//                                  //  XMM0[31:0] -> XMM2[31:0]
//                                  // else
//                                  //  XMM1[31:0] -> XMM2[31:0]
//
//                                  // if XMM1[63:32] <= XMM0[63:32] then
//                                  //  XMM0[63:32] -> XMM2[63:32]
//                                  // else
//                                  //  XMM1[63:32] -> XMM2[63:32]
//
//                                  // if XMM1[95:64] <= XMM0[95:64] then
//                                  //  XMM0[95:64] -> XMM2[95:64]
//                                  // else
//                                  //  XMM1[95:64] -> XMM2[95:64]
//
//                                  // if XMM1[127:96] <= XMM0[127:96] then
//                                  //  XMM0[127:96] -> XMM2[127:96]
//                                  // else
//                                  //  XMM1[127:96] -> XMM2[127:96]
//
//  YMM0  YMM1  YMM8  VMAXPS, // if YMM1[31:0] <= YMM0[31:0] then
//                            //  YMM0[31:0] -> YMM8[31:0]
//                            // else
//                            //  YMM1[31:0] -> YMM8[31:0]
//
//                            // if YMM1[63:32] <= YMM0[63:32] then
//                            //  YMM0[63:32] -> YMM8[63:32]
//                            // else
//                            //  YMM1[63:32] -> YMM8[63:32]
//
//                            // if YMM1[95:64] <= YMM0[95:64] then
//                            //  YMM0[95:64] -> YMM8[95:64]
//                            // else
//                            //  YMM1[95:64] -> YMM8[95:64]
//
//                            // if YMM1[127:96] <= YMM0[127:96] then
//                            //  YMM0[127:96] -> YMM8[127:96]
//                            // else
//                            //  YMM1[127:96] -> YMM8[127:96]
//
//                            // if YMM1[159:128] <= YMM0[159:128] then
//                            //  YMM0[159:128] -> YMM8[159:128]
//                            // else
//                            //  YMM1[159:128] -> YMM8[159:128]
//
//                            // if YMM1[191:160] <= YMM0[191:160] then
//                            //  YMM0[191:160] -> YMM8[191:160]
//                            // else
//                            //  YMM1[191:160] -> YMM8[191:160]
//
//                            // if YMM1[223:192] <= YMM0[223:192] then
//                            //  YMM0[223:192] -> YMM8[223:192]
//                            // else
//                            //  YMM1[223:192] -> YMM8[223:192]
//
//                            // if YMM1[255:224] <= YMM0[255:224] then
//                            //  YMM0[255:224] -> YMM8[255:224]
//                            // else
//                            //  YMM1[255:224] -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmaxsdcomma ( MAXSD, )
//
// C prototype:
//  void dg_forthmaxsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MAXSD instruction. This sequence compares the double precision
//   floating point value in the lower 64 bits of the destination with the
//   corresponding double precision floating point value in the lower 64 bits of
//   the source and stores the maximum to the lower 64 bits of the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  MAXSD,     // if XMM0[63:0] <= [RBX][63:0] then
//                            //  [RBX][63:0] -> XMM0[63:0]
//
//  XMM2  XMM0  MAXSD,        // if XMM0[63:0] <= XMM2[63:0] then
//                            //  XMM2[63:0] -> XMM0[63:0]
//
//  XMM2 <-  XMM0  MAXSD,  // if XMM2[63:0] <= XMM0[63:0] then
//                            //  XMM0[63:0] -> XMM2[63:0]
//
//  XMM0  XMM8  MAXSD,        // if XMM8[63:0] <= XMM0[63:0] then
//                            //  XMM0[63:0] -> XMM8[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmaxsdcomma ( VMAXSD, )
//
// C prototype:
//  void dg_forthvmaxsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for target y can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter lists for target x and target z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VMAXSD instruction. This sequence compares the double precision
//   floating point value in the lower 64 bits of target y with the
//   corresponding double precision floating point value in the lower 64 bits of
//   the source and stores the maximum to the lower 64 bits of the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VMAXSD,     // if XMM1[63:0] <= [RBX][63:0] then
//                                   //  [RBX][63:0] -> XMM0[63:0]
//                                   // else
//                                   //  XMM1[63:0] -> XMM0[63:0]
//
//  XMM2  XMM1  XMM0  VMAXSD,        // if XMM1[63:0] <= XMM2[63:0] then
//                                   //  XMM2[63:0] -> XMM0[63:0]
//                                   // else
//                                   //  XMM1[63:0] -> XMM0[63:0]
//
//  XMM2 <-  XMM1  XMM0  VMAXSD,  // if XMM1[63:0] <= XMM0[63:0] then
//                                   //  XMM0[63:0] -> XMM2[63:0]
//                                   // else
//                                   //  XMM1[63:0] -> XMM2[63:0]
//
//  XMM0  XMM1  XMM8  VMAXSD,        // if XMM1[63:0] <= XMM0[63:0] then
//                                   //  XMM0[63:0] -> XMM8[63:0]
//                                   // else
//                                   //  XMM1[63:0] -> XMM8[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y
//   must be an xmm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmaxsscomma ( MAXSS, )
//
// C prototype:
//  void dg_forthmaxsscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MAXSS instruction. This sequence compares the single precision
//   floating point value in the lower 32 bits of the destination with the
//   corresponding single precision floating point value in the lower 32 bits of
//   the source and stores the maximum to the lower 32 bits of the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  MAXSS,     // if XMM0[31:0] <= [RBX][31:0] then
//                            //  [RBX][31:0] -> XMM0[31:0]
//
//  XMM2  XMM0  MAXSS,        // if XMM0[31:0] <= XMM2[31:0] then
//                            //  XMM2[31:0] -> XMM0[31:0]
//
//  XMM2 <-  XMM0  MAXSS,  // if XMM2[31:0] <= XMM0[31:0] then
//                            //  XMM0[31:0] -> XMM2[31:0]
//
//  XMM0  XMM8  MAXSS,        // if XMM8[31:0] <= XMM0[31:0] then
//                            //  XMM0[31:0] -> XMM8[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmaxsscomma ( VMAXSS, )
//
// C prototype:
//  void dg_forthvmaxsscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for target y can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for target x and target z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VMAXSS instruction. This sequence compares the single precision
//   floating point value in the lower 32 bits of target y with the
//   corresponding single precision floating point value in the lower 32 bits of
//   the source and stores the maximum to the lower 32 bits of the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VMAXSS,     // if XMM1[31:0] <= [RBX][31:0] then
//                                   //  [RBX][31:0] -> XMM0[31:0]
//                                   // else
//                                   //  XMM1[31:0] -> XMM0[31:0]
//
//  XMM2  XMM1  XMM0  VMAXSS,        // if XMM1[31:0] <= XMM2[31:0] then
//                                   //  XMM2[31:0] -> XMM0[31:0]
//                                   // else
//                                   //  XMM1[31:0] -> XMM0[31:0]
//
//  XMM2 <-  XMM1  XMM0  VMAXSS,  // if XMM1[31:0] <= XMM0[31:0] then
//                                   //  XMM0[31:0] -> XMM2[31:0]
//                                   // else
//                                   //  XMM1[31:0] -> XMM2[31:0]
//
//  XMM0  XMM1  XMM8  VMAXSS,        // if XMM1[31:0] <= XMM0[31:0] then
//                                   //  XMM0[31:0] -> XMM8[31:0]
//                                   // else
//                                   //  XMM1[31:0] -> XMM8[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y
//   must be an xmm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmfencecomma ( MFENCE, )
//
// C prototype:
//  void dg_forthmfencecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode sequence for an x86 MFENCE instruction. This opcode sequence
//   makes sure all memory changes are globally visible before the next instruction.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  MFENCE,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthminpdcomma ( MINPD, )
//
// C prototype:
//  void dg_forthminpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MINPD instruction. This sequence compares each double precision
//   floating point value in the destination with the corresponding double
//   precision floating point value in the source and stores the minimum
//   to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  MINPD,     // if XMM0[63:0] >= [RBX][63:0] then
//                            //  [RBX][63:0] -> XMM0[63:0]
//                            // if XMM0[127:64] >= [RBX][127:64] then
//                            //  [RBX][127:64] -> XMM0[127:64]
//
//  XMM2  XMM0  MINPD,        // if XMM0[63:0] >= XMM2[63:0] then
//                            //  XMM2[63:0] -> XMM0[63:0]
//                            // if XMM0[127:64] >= XMM2[127:64] then
//                            //  XMM2[127:64] -> XMM0[127:64]
//
//  XMM2 <-  XMM0  MINPD,  // if XMM2[63:0] >= XMM0[63:0] then
//                            //  XMM0[63:0] -> XMM2[63:0]
//                            // if XMM2[127:64] >= XMM0[127:64] then
//                            //  XMM0[127:64] -> XMM2[127:64]
//
//  XMM0  XMM8  MINPD,        // if XMM8[63:0] >= XMM0[63:0] then
//                            //  XMM0[63:0] -> XMM8[63:0]
//                            // if XMM8[127:64] >= XMM0[127:64] then
//                            //  XMM0[127:64] -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvminpdcomma ( VMINPD, )
//
// C prototype:
//  void dg_forthvminpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VMINPD instruction. This sequence compares each double precision
//   floating point value in target y with the corresponding double
//   precision floating point value in the source and stores the minimum
//   to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VMINPD,  // if XMM1[63:0] >= [RBX][63:0] then
//                                //  [RBX][63:0] -> XMM0[63:0]
//                                // else
//                                //  XMM1[63:0] -> XMM0[63:0]
//
//                                // if XMM1[127:64] >= [RBX][127:64] then
//                                //  [RBX][127:64] -> XMM0[127:64]
//                                // else
//                                //  XMM1[127:64] -> XMM0[127:64]
//
//  XMM2  XMM1  XMM0  VMINPD, // if XMM1[63:0] >= XMM2[63:0] then
//                            //  XMM2[63:0] -> XMM0[63:0]
//                            // else
//                            //  XMM1[63:0] -> XMM0[63:0]
//
//                            // if XMM1[127:64] >= XMM2[127:64] then
//                            //  XMM2[127:64] -> XMM0[127:64]
//                            // else
//                            //  XMM1[127:64] -> XMM0[127:64]
//
//  XMM2 <-  XMM1  XMM0  VMINPD,
//                            // if XMM1[63:0] >= XMM0[63:0] then
//                            //  XMM0[63:0] -> XMM2[63:0]
//                            // else
//                            //  XMM1[63:0] -> XMM2[63:0]
//
//                            // if XMM1[127:64] >= XMM0[127:64] then
//                            //  XMM0[127:64] -> XMM2[127:64]
//                            // else
//                            //  XMM1[127:64] -> XMM2[127:64]
//
//  YMM0  YMM1  YMM8  VMINPD, // if YMM1[63:0] >= YMM0[63:0] then
//                            //  YMM0[63:0] -> YMM8[63:0]
//                            // else
//                            //  YMM1[63:0] -> YMM8[63:0]
//
//                            // if YMM1[127:64] >= YMM0[127:64] then
//                            //  YMM0[127:64] -> YMM8[127:64]
//                            // else
//                            //  YMM1[127:64] -> YMM8[127:64]
//
//                            // if YMM1[191:128] >= YMM0[191:128] then
//                            //  YMM0[191:128] -> YMM8[191:128]
//                            // else
//                            //  YMM1[191:128] -> YMM8[191:128]
//
//                            // if YMM1[255:192] >= YMM0[255:192] then
//                            //  YMM0[255:192] -> YMM8[255:192]
//                            // else
//                            //  YMM1[255:192] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthminpscomma ( MINPS, )
//
// C prototype:
//  void dg_forthminpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MINPS instruction. This sequence compares each single precision
//   floating point value in the destination with the corresponding single
//   precision floating point value in the source and stores the minimum
//   to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  MINPS,     // if XMM0[31:0] >= [RBX][31:0] then
//                            //  [RBX][31:0] -> XMM0[31:0]
//                            // if XMM0[63:32] >= [RBX][63:32] then
//                            //  [RBX][63:32] -> XMM0[63:32]
//                            // if XMM0[95:64] >= [RBX][95:64] then
//                            //  [RBX][95:64] -> XMM0[95:64]
//                            // if XMM0[127:96] >= [RBX][127:96] then
//                            //  [RBX][127:96] -> XMM0[127:96]
//
//  XMM2  XMM0  MINPS,        // if XMM0[31:0] >= XMM2[31:0] then
//                            //  XMM2[31:0] -> XMM0[31:0]
//                            // if XMM0[63:32] >= XMM2[63:32] then
//                            //  XMM2[63:32] -> XMM0[63:32]
//                            // if XMM0[95:64] >= XMM2[95:64] then
//                            //  XMM2[95:64] -> XMM0[95:64]
//                            // if XMM0[127:96] >= XMM2[127:96] then
//                            //  XMM2[127:96] -> XMM0[127:96]
//
//  XMM2 <-  XMM0  MINPS,  // if XMM2[31:0] >= XMM0[31:0] then
//                            //  XMM0[31:0] -> XMM2[31:0]
//                            // if XMM2[63:32] >= XMM0[63:32] then
//                            //  XMM0[63:32] -> XMM2[63:32]
//                            // if XMM2[95:64] >= XMM0[95:64] then
//                            //  XMM0[95:64] -> XMM2[95:64]
//                            // if XMM2[127:96] >= XMM0[127:96] then
//                            //  XMM0[127:96] -> XMM2[127:96]
//
//  XMM0  XMM8  MINPS,        // if XMM8[31:0] >= XMM0[31:0] then
//                            //  XMM0[31:0] -> XMM8[31:0]
//                            // if XMM8[63:32] >= XMM0[63:32] then
//                            //  XMM0[63:32] -> XMM8[63:32]
//                            // if XMM8[95:64] >= XMM0[95:64] then
//                            //  XMM0[95:64] -> XMM8[95:64]
//                            // if XMM8[127:96] >= XMM0[127:96] then
//                            //  XMM0[127:96] -> XMM8[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvminpscomma ( VMINPS, )
//
// C prototype:
//  void dg_forthvminpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VMINPS instruction. This sequence compares each single precision
//   floating point value in target y with the corresponding single
//   precision floating point value in the source and stores the minimum
//   to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VMINPS,  // if XMM1[31:0] >= [RBX][31:0] then
//                                //  [RBX][31:0] -> XMM0[31:0]
//                                // else
//                                //  XMM1[31:0] -> XMM0[31:0]
//
//                                // if XMM1[63:32] >= [RBX][63:32] then
//                                //  [RBX][63:32] -> XMM0[63:32]
//                                // else
//                                //  XMM1[63:32] -> XMM0[63:32]
//
//                                // if XMM1[95:64] >= [RBX][95:64] then
//                                //  [RBX][95:64] -> XMM0[95:64]
//                                // else
//                                //  XMM1[95:64] -> XMM0[95:64]
//
//                                // if XMM1[127:96] >= [RBX][127:96] then
//                                //  [RBX][127:96] -> XMM0[127:96]
//                                // else
//                                //  XMM1[127:96] -> XMM0[127:96]
//
//  XMM2  XMM1  XMM0  VMINPS,     // if XMM1[31:0] >= XMM2[31:0] then
//                                //  XMM2[31:0] -> XMM0[31:0]
//                                // else
//                                //  XMM1[31:0] -> XMM0[31:0]
//
//                                // if XMM1[63:32] >= XMM2[63:32] then
//                                //  XMM2[63:32] -> XMM0[63:32]
//                                // else
//                                //  XMM1[63:32] -> XMM0[63:32]
//
//                                // if XMM1[95:64] >= XMM2[95:64] then
//                                //  XMM2[95:64] -> XMM0[95:64]
//                                // else
//                                //  XMM1[95:64] -> XMM0[95:64]
//
//                                // if XMM1[127:96] >= XMM2[127:96] then
//                                //  XMM2[127:96] -> XMM0[127:96]
//                                // else
//                                //  XMM1[127:96] -> XMM0[127:96]
//
//  XMM2 <-  XMM1  XMM0  VMINPS, // if XMM1[31:0] >= XMM0[31:0] then
//                                  //  XMM0[31:0] -> XMM2[31:0]
//                                  // else
//                                  //  XMM1[31:0] -> XMM2[31:0]
//
//                                  // if XMM1[63:32] >= XMM0[63:32] then
//                                  //  XMM0[63:32] -> XMM2[63:32]
//                                  // else
//                                  //  XMM1[63:32] -> XMM2[63:32]
//
//                                  // if XMM1[95:64] >= XMM0[95:64] then
//                                  //  XMM0[95:64] -> XMM2[95:64]
//                                  // else
//                                  //  XMM1[95:64] -> XMM2[95:64]
//
//                                  // if XMM1[127:96] >= XMM0[127:96] then
//                                  //  XMM0[127:96] -> XMM2[127:96]
//                                  // else
//                                  //  XMM1[127:96] -> XMM2[127:96]
//
//  YMM0  YMM1  YMM8  VMINPS, // if YMM1[31:0] >= YMM0[31:0] then
//                            //  YMM0[31:0] -> YMM8[31:0]
//                            // else
//                            //  YMM1[31:0] -> YMM8[31:0]
//
//                            // if YMM1[63:32] >= YMM0[63:32] then
//                            //  YMM0[63:32] -> YMM8[63:32]
//                            // else
//                            //  YMM1[63:32] -> YMM8[63:32]
//
//                            // if YMM1[95:64] >= YMM0[95:64] then
//                            //  YMM0[95:64] -> YMM8[95:64]
//                            // else
//                            //  YMM1[95:64] -> YMM8[95:64]
//
//                            // if YMM1[127:96] >= YMM0[127:96] then
//                            //  YMM0[127:96] -> YMM8[127:96]
//                            // else
//                            //  YMM1[127:96] -> YMM8[127:96]
//
//                            // if YMM1[159:128] >= YMM0[159:128] then
//                            //  YMM0[159:128] -> YMM8[159:128]
//                            // else
//                            //  YMM1[159:128] -> YMM8[159:128]
//
//                            // if YMM1[191:160] >= YMM0[191:160] then
//                            //  YMM0[191:160] -> YMM8[191:160]
//                            // else
//                            //  YMM1[191:160] -> YMM8[191:160]
//
//                            // if YMM1[223:192] >= YMM0[223:192] then
//                            //  YMM0[223:192] -> YMM8[223:192]
//                            // else
//                            //  YMM1[223:192] -> YMM8[223:192]
//
//                            // if YMM1[255:224] >= YMM0[255:224] then
//                            //  YMM0[255:224] -> YMM8[255:224]
//                            // else
//                            //  YMM1[255:224] -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthminsdcomma ( MINSD, )
//
// C prototype:
//  void dg_forthminsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MINSD instruction. This sequence compares the double precision
//   floating point value in the lower 64 bits of the destination with the
//   corresponding double precision floating point value in the lower 64 bits of
//   the source and stores the minimum to the lower 64 bits of the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  MINSD,     // if XMM0[63:0] >= [RBX][63:0] then
//                            //  [RBX][63:0] -> XMM0[63:0]
//
//  XMM2  XMM0  MINSD,        // if XMM0[63:0] >= XMM2[63:0] then
//                            //  XMM2[63:0] -> XMM0[63:0]
//
//  XMM2 <-  XMM0  MINSD,  // if XMM2[63:0] >= XMM0[63:0] then
//                            //  XMM0[63:0] -> XMM2[63:0]
//
//  XMM0  XMM8  MINSD,        // if XMM8[63:0] >= XMM0[63:0] then
//                            //  XMM0[63:0] -> XMM8[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvminsdcomma ( VMINSD, )
//
// C prototype:
//  void dg_forthvminsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for target y can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter lists for target x and target z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VMINSD instruction. This sequence compares the double precision
//   floating point value in the lower 64 bits of target y with the
//   corresponding double precision floating point value in the lower 64 bits of
//   the source and stores the minimum to the lower 64 bits of the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VMINSD,     // if XMM1[63:0] >= [RBX][63:0] then
//                                   //  [RBX][63:0] -> XMM0[63:0]
//                                   // else
//                                   //  XMM1[63:0] -> XMM0[63:0]
//
//  XMM2  XMM1  XMM0  VMINSD,        // if XMM1[63:0] >= XMM2[63:0] then
//                                   //  XMM2[63:0] -> XMM0[63:0]
//                                   // else
//                                   //  XMM1[63:0] -> XMM0[63:0]
//
//  XMM2 <-  XMM1  XMM0  VMINSD,  // if XMM1[63:0] >= XMM0[63:0] then
//                                   //  XMM0[63:0] -> XMM2[63:0]
//                                   // else
//                                   //  XMM1[63:0] -> XMM2[63:0]
//
//  XMM0  XMM1  XMM8  VMINSD,        // if XMM1[63:0] >= XMM0[63:0] then
//                                   //  XMM0[63:0] -> XMM8[63:0]
//                                   // else
//                                   //  XMM1[63:0] -> XMM8[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y
//   must each be an xmm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthminsscomma ( MINSS, )
//
// C prototype:
//  void dg_forthminsscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MINSS instruction. This sequence compares the single precision
//   floating point value in the lower 32 bits of the destination with the
//   corresponding single precision floating point value in the lower 32 bits of
//   the source and stores the minimum to the lower 32 bits of the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  MINSS,     // if XMM0[31:0] >= [RBX][31:0] then
//                            //  [RBX][31:0] -> XMM0[31:0]
//
//  XMM2  XMM0  MINSS,        // if XMM0[31:0] >= XMM2[31:0] then
//                            //  XMM2[31:0] -> XMM0[31:0]
//
//  XMM2 <-  XMM0  MINSS,  // if XMM2[31:0] >= XMM0[31:0] then
//                            //  XMM0[31:0] -> XMM2[31:0]
//
//  XMM0  XMM8  MINSS,        // if XMM8[31:0] >= XMM0[31:0] then
//                            //  XMM0[31:0] -> XMM8[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvminsscomma ( VMINSS, )
//
// C prototype:
//  void dg_forthvminsscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for target y can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for target x and target z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VMINSS instruction. This sequence compares the single precision
//   floating point value in the lower 32 bits of target y with the
//   corresponding single precision floating point value in the lower 32 bits of
//   the source and stores the minimum to the lower 32 bits of the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VMINSS,     // if XMM1[31:0] >= [RBX][31:0] then
//                                   //  [RBX][31:0] -> XMM0[31:0]
//                                   // else
//                                   //  XMM1[31:0] -> XMM0[31:0]
//
//  XMM2  XMM1  XMM0  VMINSS,        // if XMM1[31:0] >= XMM2[31:0] then
//                                   //  XMM2[31:0] -> XMM0[31:0]
//                                   // else
//                                   //  XMM1[31:0] -> XMM0[31:0]
//
//  XMM2 <-  XMM1  XMM0  VMINSS,  // if XMM1[31:0] >= XMM0[31:0] then
//                                   //  XMM0[31:0] -> XMM2[31:0]
//                                   // else
//                                   //  XMM1[31:0] -> XMM2[31:0]
//
//  XMM0  XMM1  XMM8  VMINSS,        // if XMM1[31:0] >= XMM0[31:0] then
//                                   //  XMM0[31:0] -> XMM8[31:0]
//                                   // else
//                                   //  XMM1[31:0] -> XMM8[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y
//   must each be an xmm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmonitorcomma ( MONITOR, )
//
// C prototype:
//  void dg_forthmonitorcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode string for the x86 MONITOR instruction.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  MONITOR,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovcomma ( MOV, )
//
// C prototype:
//  void dg_forthmovcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   datavalue datasize IMMEDIATE
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   8BIT
//   16BIT
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               if the other target is a 64 bit register,
//                                 then this is a 64 bit integer, otherwise
//                                 the integer constant gets sign extended to
//                                 the DATASIZE of the instruction
//                                 64BIT encoding uses multiple instructions
//                                 smaller encodings get sign extended if needed
//   minimumimmediatesize         minimum encoding size in bytes for 
//                                 immediatevalue.
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. For this instruction only and
//                                 only in 64 bit mode when the other target is
//                                 a register, one instruction is used to
//                                 compile a 64 bit integer. Otherwise if N is
//                                 larger than a signed 32 bit integer, two
//                                 32 bit immediate to mem instructions are used.
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0, 1, 2, or 4
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1, 2, 4, or 8
//   DATASIZE                     sets the data size of the instruction.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOV instruction. This opcode sequence copies the sourcetarget to
//   the destinationtarget.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX 12348000 N  EAX  MOV,  // copies 12348000 to EAX
//  27 N  CL  MOV,             // copies 27 to CL
//  AX  EBX [R]  MOV,          // copies AX to the 16 bit memory at the address
//                             //  in EBX
//  38 N  EDX [R]  32BIT MOV,  // size required, copies 38 to the 32 bit memory
//                             //  at the address in EDX
//  ECX  EAX  MOV,             // copies ECX to EAX
//  ECX <- EAX  MOV,        // copies EAX to ECX
//  HEX 1122334455667788 N RAX MOV,  // copies 1122334455667788 to RAX
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovbracketntoalcomma ( MOV[N]->AL, )
//
// C prototype:
//  void dg_forthmovbracketntoalcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( n -- )
//
// Data stack in:
//
//  n
//
//  Description of target parameters:
//
//   n               in 64 bit address mode:
//                     a number in the range of a 64 bit integer
//                   in 32 bit address mode:
//                     a number in the range of a 32 bit integer
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one number from the data stack and compiles the opcode sequence for
//   an x86 MOV[N]->AL, instruction. This sequence copies the byte at address
//   n to AL.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  10283 MOV[N]->AL,          //   does [10283] -> AL
//
// Note:
//  The docs show you can put REX in front of this opcode sequence and it
//   might do something with accessing other segments but I didn't
//   understand it and I'm guessing whatever it is doesn't apply in 64
//   bit flat addressing mode.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovbracketntoaxcomma ( MOV[N]->AX, )
//
// C prototype:
//  void dg_forthmovbracketntoaxcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( n -- )
//
// Data stack in:
//
//  n
//
//  Description of target parameters:
//
//   n               in 64 bit address mode:
//                     a number in the range of a 64 bit integer
//                   in 32 bit address mode:
//                     a number in the range of a 32 bit integer
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one number from the data stack and compiles the opcode sequence for
//   an x86 MOV[N]->AX, instruction. This sequence copies the 16 bit value at
//   address n to AX.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  10283 MOV[N]->AX,          //   does [10283] -> AX
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovbracketntoeaxcomma ( MOV[N]->EAX, )
//
// C prototype:
//  void dg_forthmovbracketntoeaxcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( n -- )
//
// Data stack in:
//
//  n
//
//  Description of target parameters:
//
//   n               in 64 bit address mode:
//                     a number in the range of a 64 bit integer
//                   in 32 bit address mode:
//                     a number in the range of a 32 bit integer
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one number from the data stack and compiles the opcode sequence for
//   an x86 MOV[N]->EAX, instruction. This sequence copies the 32 bit value at
//   address n to EAX.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  10283 MOV[N]->EAX,          //   does [10283] -> EAX
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovbracketntoraxcomma ( MOV[N]->RAX, )
//
// C prototype:
//  void dg_forthmovbracketntoraxcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( n -- )
//
// Data stack in:
//
//  n
//
//  Description of target parameters:
//
//   n               in 64 bit address mode:
//                     a number in the range of a 64 bit integer
//                   in 32 bit address mode:
//                     a number in the range of a 32 bit integer
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one number from the data stack and compiles the opcode sequence for
//   an x86 MOV[N]->RAX, instruction. This sequence copies the 64 bit value at
//   address n to RAX.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  10283 MOV[N]->RAX,          //   does [10283] -> RAX
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovaltobracketncomma ( MOVAL->[N], )
//
// C prototype:
//  void dg_forthmovaltobracketncomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( n -- )
//
// Data stack in:
//
//  n
//
//  Description of target parameters:
//
//   n               in 64 bit address mode:
//                     a number in the range of a 64 bit integer
//                   in 32 bit address mode:
//                     a number in the range of a 32 bit integer
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one number from the data stack and compiles the opcode sequence for
//   an x86 MOVAL->[N], instruction. This sequence copies AL to the byte at
//   address n.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  10283 MOVAL->[N],          //   does AL -> [10283]
//
// Note:
//  The docs show you can put REX in front of this opcode sequence and it
//   might do something with accessing other segments but I didn't
//   understand it and I'm guessing whatever it is doesn't apply in 64
//   bit flat addressing mode.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovaxtobracketncomma ( MOVAX->[N], )
//
// C prototype:
//  void dg_forthmovaxtobracketncomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( n -- )
//
// Data stack in:
//
//  n
//
//  Description of target parameters:
//
//   n               in 64 bit address mode:
//                     a number in the range of a 64 bit integer
//                   in 32 bit address mode:
//                     a number in the range of a 32 bit integer
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one number from the data stack and compiles the opcode sequence for
//   an x86 MOVAX->[N], instruction. This sequence copies AX into the 16 bit
//   value at address n.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  10283 MOVAX->[N],          //   does AX -> [10283]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmoveaxtobracketncomma ( MOVEAX->[N], )
//
// C prototype:
//  void dg_forthmoveaxtobracketncomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( n -- )
//
// Data stack in:
//
//  n
//
//  Description of target parameters:
//
//   n               in 64 bit address mode:
//                     a number in the range of a 64 bit integer
//                   in 32 bit address mode:
//                     a number in the range of a 32 bit integer
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one number from the data stack and compiles the opcode sequence for
//   an x86 MOVEAX->[N], instruction. This sequence copies EAX into the 32 bit
//   value at address n.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  10283 MOVEAX->[N],          //   does EAX -> [10283]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovraxtobracketncomma ( MOVRAX->[N], )
//
// C prototype:
//  void dg_forthmovraxtobracketncomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( n -- )
//
// Data stack in:
//
//  n
//
//  Description of target parameters:
//
//   n               in 64 bit address mode:
//                     a number in the range of a 64 bit integer
//                   in 32 bit address mode:
//                     a number in the range of a 32 bit integer
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one number from the data stack and compiles the opcode sequence for
//   an x86 MOVRAX->[N], instruction. This sequence copies RAX into the 64 bit
//   value at address n.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  10283 MOVRAX->[N],          //   does RAX -> [10283]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovcrcomma ( MOVCR, )
//
// C prototype:
//  void dg_forthmovcrcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetxparameterlist or targetyparameterlist can contain
//   these addressing mode specifiers:
//
//   targetregister
//   targetregister R
//   targetcontrolregister
//   targetcontrolregister CR
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//
//   targetcontrolregister        one of:
//                                 CR0 CR2 CR3 CR4 CR8
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   R                            specifies a register target.
//
//   CR                           specifies a control register target.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVCR instruction. This opcode sequence copies a value from the
//   source to the destination. One target must be a control register, the
//   other target must be a regular register. Register size is ignored for
//   this instruction. Despite what you specify, the current address mode
//   size determines the register size used.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 32 bit address mode examples:
//  EAX  CR0  MOVCR,     // EAX -> CR0
//
//  CR0  EAX  MOVCR,     // CR0 -> EAX
//
// 64 bit address mode examples:
//  RAX  CR0  MOVCR,     // RAX -> CR0
//
//  CR0  RCX  MOVCR,     // CR0 -> RCX
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovapdcomma ( MOVAPD, )
//
// C prototype:
//  void dg_forthmovapdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVAPD instruction. This opcode sequence copies two double floating
//   point values from the source to the destination. One of the targets can be
//   an xmm register, the other can be an xmm register or memory. Memory
//   targets need to be aligned on a 128 bit boundary. Docs say the source has
//   to be two double floating point values. In reality any 128 bit binary value
//   will work.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  MOVAPD,     // [RBX][127:0] -> XMM0
//
//  XMM2  XMM0  MOVAPD,        // XMM2 -> XMM0
//
//  XMM2 <- XMM0  MOVAPD,   // XMM0 -> XMM2
//
//  XMM0 XMM8 MOVAPD,          // XMM0 -> XMM8
//
//
// Note:
//  Only 1 target can be a memory target.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovapscomma ( MOVAPS, )
//
// C prototype:
//  void dg_forthmovapscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVAPS instruction. This opcode sequence copies four single floating
//   point values from the source to the destination. One of the targets can be
//   an xmm register, the other can be an xmm register or memory. Memory
//   targets need to be aligned on a 128 bit boundary. Docs say the source has
//   to be four single floating point values. In reality any 128 bit binary value
//   will work.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  MOVAPS,     // [RBX][127:0] -> XMM0
//
//  XMM2  XMM0  MOVAPS,        // XMM2 -> XMM0
//
//  XMM2 <- XMM0  MOVAPS,   // XMM0 -> XMM2
//
//  XMM0 XMM8 MOVAPS,          // XMM0 -> XMM8
//
//
// Note:
//  Only 1 target can be a memory target.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovbecomma ( MOVBE, )
//
// C prototype:
//  void dg_forthmovbecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   16BIT
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 2, 4, or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVBE instruction. This opcode sequence reverses the byte order
//   of the value from the source and puts it into the destination.
//   One target must be memory, and the other target must be a regular
//   register. Both targets must be the same size.
//   8 bit targets are not supported.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit address mode examples:
//  RAX [R]  CX  MOVBE,      // [RAX][7:0]   -> CX[15:8]
//                           // [RAX][15:8]  -> CX[7:0]
//
//  EAX  RCX [R]  MOVBE,     // EAX[7:0]     -> [RCX][31:24]
//                           // EAX[15:8]    -> [RCX][23:16]
//                           // EAX[23:16]   -> [RCX][15:8]
//                           // EAX[31:24]   -> [RCX][7:0]
//
//  RAX  RCX [R]  MOVBE,     // RAX[7:0]     -> [RCX][63:56]
//                           // RAX[15:8]    -> [RCX][55:48]
//                           // RAX[23:16]   -> [RCX][47:40]
//                           // RAX[31:24]   -> [RCX][39:32]
//                           // RAX[39:32]   -> [RCX][31:24]
//                           // RAX[47:40]   -> [RCX][23:16]
//                           // RAX[55:48]   -> [RCX][15:8]
//                           // RAX[63:56]   -> [RCX][7:0]
//
// Note:
//  You do not have to specify the data size of the memory target, but if you
//  do it must be the same size as the register target.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, so you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovdcomma ( MOVD, )
//
// C prototype:
//  void dg_forthmovdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetregister
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to set the data size for a memory target (you don't need to),
//   you can use these:
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after the memory target
//                                 parameters and can not come in the middle
//                                 of the memory target parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
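//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.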
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 4 or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVD instruction. This opcode sequence moves a 32 or 64 bit value
//   from the source to the destination. One target must be an xmm or floating
//   point register. The other target must be a register or memory target.
//   If one of the targets is a memory target and you do not specify the size,
//   a default size of 32 bits will be used. If you use a 64 bit register, or
//   specify 64BIT with a memory target, a 64 bit value will be copied,
//   otherwise a 32 bit value will be copied.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  ST0  EAX  MOVD,              // ST0[31:0] -> EAX
//
//  EAX  ST1  MOVD,              // EAX -> ST1[31:0]
//                               //   0 -> ST1[63:32]
//
//  RAX [R]  ST1  MOVD,          // [RAX][31:0] -> ST1[31:0]
//                               //           0 -> ST1[63:32]
//
//  ST0  RAX  MOVD,              // ST0 -> RAX
//
//  RAX [R] 64BIT  ST1  MOVD,    // [RAX][63:0] -> ST1
//
//  XMM0  ECX  MOVD,             // XMM0[31:0] -> ECX
//
//  ECX  XMM0  MOVD,             // ECX -> XMM0[31:0]
//                               //   0 -> XMM0[127:32]
//
//  RCX [R]  XMM0  MOVD,         // [RCX][31:0] -> XMM0[31:0]
//                               //           0 -> XMM0[127:32]
//
//  RCX [R] 64BIT  XMM0  MOVD,   // [RCX][63:0] -> XMM0[63:0]
//                               //           0 -> XMM0[127:64]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, so you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovq2comma ( MOVQ2, )
//
// C prototype:
//  void dg_forthmovq2comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetregister
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to set the data size for a memory target (you don't need to),
//   you can use these:
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after the memory target
//                                 parameters and can not come in the middle
//                                 of the memory target parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 4 or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVQ instruction. This opcode sequence moves a 32 or 64 bit value
//   from the source to the destination. One target must be an xmm or floating
//   point register. The other target must be a register or memory target.
//   If one of the targets is a memory target and you do not specify the size,
//   a default size of 64 bits will be used. If you use a 32 bit register, or
//   specify 32BIT with a memory target, a 32 bit value will be copied,
//   otherwise a 64 bit value will be copied.
//   I renamed this compiling word MOVQ2 because there is another compiling
//   word with the name MOVQ.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  ST0  EAX  MOVQ2,             // ST0[31:0] -> EAX
//
//  EAX  ST1  MOVQ2,             // EAX -> ST1[31:0]
//                               //   0 -> ST1[63:32]
//
//  RAX [R] 32BIT  ST1  MOVQ2,   // [RAX][31:0] -> ST1[31:0]
//                               //           0 -> ST1[63:32]
//
//  ST0  RAX  MOVQ2,             // ST0 -> RAX
//
//  RAX [R]  ST1  MOVQ2,         // [RAX][63:0] -> ST1
//
//  XMM0  ECX  MOVQ2,            // XMM0[31:0] -> ECX
//
//  ECX  XMM0  MOVQ2,            // ECX -> XMM0[31:0]
//                               //   0 -> XMM0[127:32]
//
//  RCX [R] 32BIT  XMM0  MOVQ2,  // [RCX][31:0] -> XMM0[31:0]
//                               //           0 -> XMM0[127:32]
//
//  RCX [R]  XMM0  MOVQ2,        // [RCX][63:0] -> XMM0[63:0]
//                               //           0 -> XMM0[127:64]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, so you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovdcomma ( VMOVD, )
//
// C prototype:
//  void dg_forthvmovdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to set the data size for a memory target, (you don't need to)
//   you can use this:
//   32BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after the memory target
//                                 parameters and can not come in the middle
//                                 of the memory target parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 4
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVD instruction. This opcode sequence moves a 32 bit value
//   from the source to the destination. One target must be an xmm register.
//   The other target must be a register or memory target.
//   If one of the targets is a memory target and you do not specify the size,
//   a default size of 32 bits will be used. If the destination is an xmm
//   or ymm register, the 32 bit value gets zero extended to the size of the
//   destination register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
//  XMM0  ECX  VMOVD,             // XMM0[31:0] -> ECX
//
//  ECX  XMM0  VMOVD,             // ECX -> XMM0[31:0]
//                                //   0 -> XMM0[127:32]
//
//  RCX [R]  XMM0  VMOVD,         // [RCX][31:0] -> XMM0[31:0]
//                                //           0 -> XMM0[127:32]
//
//  RCX [R] 32BIT  XMM0  VMOVD,   // [RCX][31:0] -> XMM0[31:0]
//                                //           0 -> XMM0[127:32]
//
// Note:
//  If you specify a memory target size, it must be 32 bits.
//  Regular register target sizes are ignored for this instruction.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovq2comma ( VMOVQ2, )
//
// C prototype:
//  void dg_forthvmovq2comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to set the data size for a memory target, (you don't need to)
//   you can use this:
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after the memory target
//                                 parameters and can not come in the middle
//                                 of the memory target parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVQ2 instruction. This opcode sequence moves a 64 bit value
//   from the source to the destination. One target must be an xmm register.
//   The other target must be a register or memory target.
//   If one of the targets is a memory target and you do not specify the size,
//   a default size of 64 bits will be used. If the destination is an xmm
//   or ymm register, the 64 bit value gets zero extended to the size of the
//   destination register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
//  XMM0  RCX  VMOVQ2,             // XMM0[63:0] -> RCX
//
//  RCX  XMM0  VMOVQ2,             // RCX -> XMM0[63:0]
//                                 //   0 -> XMM0[127:64]
//
//  RCX [R]  XMM0  VMOVQ2,         // [RCX][63:0] -> XMM0[63:0]
//                                 //           0 -> XMM0[127:64]
//
//  RCX [R] 64BIT  XMM0  VMOVQ2,   // [RCX][63:0] -> XMM0[63:0]
//                                 //           0 -> XMM0[127:64]
//
// Note:
//  If you specify a memory target size, it must be 64 bits.
//  Regular register target sizes are ignored for this instruction.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovddupcomma ( MOVDDUP, )
//
// C prototype:
//  void dg_forthmovddupcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVDDUP instruction. This sequence copies the double precision
//   floating point value in the lower 64 bits of the source to the lower
//   64 bits of the destination and also the upper 64 bits of the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  MOVDDUP,    // [RBX][63:0] -> XMM0[63:0]
//                             // [RBX][63:0] -> XMM0[127:64]
//
//  XMM2  XMM0  MOVDDUP,       // XMM2[63:0] -> XMM0[63:0]
//                             // XMM2[63:0] -> XMM0[127:64]
//
//  XMM2 <-  XMM0  MOVDDUP, // XMM0[63:0] -> XMM2[63:0]
//                             // XMM0[63:0] -> XMM2[127:64]
//
//  XMM0  XMM8  MOVDDUP,       // XMM0[63:0] -> XMM8[63:0]
//                             // XMM0[63:0] -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovddupcomma ( VMOVDDUP, )
//
// C prototype:
//  void dg_forthvmovddupcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymm register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVDDUP instruction. If the destination is an xmm register, then 
//   this sequence copies the double precision floating point value in the first 
//   (lowest) 64 bit section of the source to the first (lowest) 64 bit section 
//   of the  destination and also the second 64 bit section of the destination.
//   If the destination is a ymm register, then this opcode sequence does the
//   same thing as if it was an xmm register then also copies the third 64 bit
//   section of the source to the third and fourth 64 bit sections of the
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VMOVDDUP,    // [RBX][63:0]   -> XMM0[63:0]
//                              // [RBX][63:0]   -> XMM0[127:64]
//
//  XMM2  XMM0  VMOVDDUP,       // XMM2[63:0]    -> XMM0[63:0]
//                              // XMM2[63:0]    -> XMM0[127:64]
//
//  XMM2 <-  XMM0  VMOVDDUP, // XMM0[63:0]    -> XMM2[63:0]
//                              // XMM0[63:0]    -> XMM2[127:64]
//
//  YMM0  YMM8  VMOVDDUP,       // YMM0[63:0]    -> YMM8[63:0]
//                              // YMM0[63:0]    -> YMM8[127:64]
//                              // YMM0[191:128] -> YMM8[191:128]
//                              // YMM0[191:128] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovdqucomma ( MOVDQU, )
//
// C prototype:
//  void dg_forthmovdqucomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVDQU instruction. This opcode sequence copies a 128 bit value
//   from the source to the destination. One of the targets must be
//   an xmm register; the other can be an xmm register or a memory target.
//   Memory targets do NOT need to be aligned on a 128 bit boundary.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  MOVDQU,     // [RBX][127:0] -> XMM0
//
//  XMM2  XMM0  MOVDQU,        // XMM2 -> XMM0
//
//  XMM2 <- XMM0  MOVDQU,   // XMM0 -> XMM2
//
//  XMM0 XMM8 MOVDQU,          // XMM0 -> XMM8
//
//
// Note:
//  Only 1 target can be a memory target.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovdqucomma ( VMOVDQU, )
//
// C prototype:
//  void dg_forthvmovdqucomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either target's
//                                 addressing mode parameters and cannot come
//                                 in the middle of a target's addressing mode
//                                 parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either target's
//                                 addressing mode parameters and cannot come
//                                 in the middle of a target's addressing mode
//                                 parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   YMMR                         specifies a ymm register target.
//                                 YMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVDQU instruction. This opcode sequence copies a 128 bit or 256
//   bit value from the source to the destination. One of the targets can be
//   an xmm register or ymm register, the other can be an xmm register,
//   ymm register, or memory. Memory targets do not need to be aligned to
//   a boundary. If both targets are register targets, they both have to be
//   xmm registers or both have to be ymm registers.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VMOVDQU,     // [RBX][127:0] -> XMM0
//
//  XMM2  XMM0  VMOVDQU,        // XMM2 -> XMM0
//
//  XMM2 <- XMM0  VMOVDQU,   // XMM0 -> XMM2
//
//  XMM0 XMM8 VMOVDQU,          // XMM0 -> XMM8
//
//
// Note:
//  Only 1 target can be a memory target.
//  Using XMMR or YMMR to specify the register target when using a memory target
//   will force 3 byte vex encoding even when a 2 byte vex form is available.
//   When both targets are registers, using XMMR or YMMR on the destination
//   target forces 3 byte vex.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovqcomma ( VMOVQ, )
//
// C prototype:
//  void dg_forthvmovqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either target's
//                                 addressing mode parameters and cannot come
//                                 in the middle of a target's addressing mode
//                                 parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either target's
//                                 addressing mode parameters and cannot come
//                                 in the middle of a target's addressing mode
//                                 parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   YMMR                         specifies a ymm register target.
//                                 YMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVQ instruction. This opcode sequence copies a 64 bit value from
//   the source to the destination. One of the targets can be
//   an xmm register, the other can be an xmm register or memory. If the
//   destination is an XMM register, the 64 bit result gets zero extended to
//   128 bits.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VMOVQ,     // [RBX][63:0] -> XMM0[63:0]
//                            // 0           -> XMM0[127:64]
//
//  XMM2  XMM0  VMOVQ,        // XMM2[63:0] -> XMM0[63:0]
//                            // 0          -> XMM0[127:64]
//
//  XMM2 <- XMM0  VMOVQ,   // XMM0[63:0] -> XMM2[63:0]
//                            // 0          -> XMM2[127:64]
//
//  XMM0 XMM8 VMOVQ,          // XMM0[63:0] -> XMM8[63:0]
//                            // 0          -> XMM8[127:64]
//
//
// Note:
//  Only 1 target can be a memory target.
//  Using XMMR to specify the register target when using a memory target
//   will force 3 byte vex encoding even when a 2 byte vex form is available.
//   When both targets are registers, using XMMR on the destination
//   target forces 3 byte vex.
//  Intel docs say MOVQ does not require alignment for memory targets.
//   (section 14.9 Memory Alignment)
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovupdcomma ( VMOVUPD, )
//
// C prototype:
//  void dg_forthvmovupdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either target's
//                                 addressing mode parameters and cannot come
//                                 in the middle of a target's addressing mode
//                                 parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either target's
//                                 addressing mode parameters and cannot come
//                                 in the middle of a target's addressing mode
//                                 parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   YMMR                         specifies a ymm register target.
//                                 YMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVUPD instruction. This opcode sequence copies a 128 bit or 256
//   bit value from the source to the destination. One of the targets can be
//   an xmm register or ymm register, the other can be an xmm register,
//   ymm register, or memory. Memory targets do not need to be aligned to
//   a boundary. If both targets are register targets, they both have to be
//   xmm registers or both have to be ymm registers.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VMOVUPD,     // [RBX][127:0] -> XMM0
//
//  XMM2  XMM0  VMOVUPD,        // XMM2 -> XMM0
//
//  XMM2 <- XMM0  VMOVUPD,   // XMM0 -> XMM2
//
//  XMM0 XMM8 VMOVUPD,          // XMM0 -> XMM8
//
//
// Note:
//  Only 1 target can be a memory target.
//  Using XMMR or YMMR to specify the register target when using a memory target
//   will force 3 byte vex encoding even when a 2 byte vex form is available.
//   When both targets are registers, using XMMR or YMMR on the destination
//   target forces 3 byte vex.
//  Intel docs say this instruction moves 2 64 bit floating point values, but
//   it can be any 128 bit value.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovupscomma ( VMOVUPS, )
//
// C prototype:
//  void dg_forthvmovupscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either target's
//                                 addressing mode parameters and cannot come
//                                 in the middle of a target's addressing mode
//                                 parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either target's
//                                 addressing mode parameters and cannot come
//                                 in the middle of a target's addressing mode
//                                 parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   YMMR                         specifies a ymm register target.
//                                 YMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVUPS instruction. This opcode sequence copies a 128 bit or 256
//   bit value from the source to the destination. One of the targets can be
//   an xmm register or ymm register, the other can be an xmm register,
//   ymm register, or memory. Memory targets do not need to be aligned to
//   a boundary. If both targets are register targets, they both have to be
//   xmm registers or both have to be ymm registers.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VMOVUPS,     // [RBX][127:0] -> XMM0
//
//  XMM2  XMM0  VMOVUPS,        // XMM2 -> XMM0
//
//  XMM2 <- XMM0  VMOVUPS,   // XMM0 -> XMM2
//
//  XMM0 XMM8 VMOVUPS,          // XMM0 -> XMM8
//
//
// Note:
//  Only 1 target can be a memory target.
//  Using XMMR or YMMR to specify the register target when using a memory target
//   will force 3 byte vex encoding even when a 2 byte vex form is available.
//   When both targets are registers, using XMMR or YMMR on the destination
//   target forces 3 byte vex.
//  Intel docs say this instruction moves 4 32 bit floating point values, but
//   it can be any 128 bit value.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmaskmovpdcomma ( VMASKMOVPD, )
//
// C prototype:
//  void dg_forthvmaskmovpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing
//   mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist and targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   and targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   YMMR                         specifies an ymmr register target.
//                                 YMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VMASKMOVPD instruction. This opcode sequence copies 64 bit values
//   from the source to the destination based on the value of the mask.
//   targetxparameterlist and targetzparameterlist are the source and
//   destination. One of the source and destination targets must be a memory
//   target and the other must be an xmm or ymm register. targetyparameterlist
//   is the mask and must be an xmm or ymm register. If the high bit of a 64 bit
//   section of the mask is set, then the corresponding 64 bit section is copied
//   from the source to the destination. If the high bit of a 64 bit section is
//   clear and the destination is an xmm or ymm register, the corresponding
//   64 bit section is cleared. If the high bit of a 64 bit section is clear
//   and the destination is memory, the corresponding 64 bit section of the
//   memory target is left unchanged.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VMASKMOVPD,     // if XMM1[63] is 1 then
//                                       //  [RBX][63:0]   -> XMM0[63:0]
//                                       // else
//                                       //  0             -> XMM0[63:0]
//                                       // if XMM1[127] is 1 then
//                                       //  [RBX][127:64] -> XMM0[127:64]
//                                       // else
//                                       //  0             -> XMM0[127:64]
//
//  XMM0  XMM1  RBX [R]  VMASKMOVPD,     // if XMM1[63] is 1 then
//                                       //  XMM0[63:0]   -> [RBX][63:0]
//                                       // if XMM1[127] is 1 then
//                                       //  XMM0[127:64] -> [RBX][127:64]
//
//
// Note:
//  Both the source and destination can not be xmm/ymm registers, one must be
//   a memory target. Both also can not be memory.
//  Intel docs say this instruction moves 2 or 4 64 bit floating point values,
//   but they can be any 64 bit values.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmaskmovpscomma ( VMASKMOVPS, )
//
// C prototype:
//  void dg_forthvmaskmovpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing
//   mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist and targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   and targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   YMMR                         specifies an ymmr register target.
//                                 YMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VMASKMOVPS instruction. This opcode sequence copies 32 bit values
//   from the source to the destination based on the value of the mask.
//   targetxparameterlist and targetzparameterlist are the source and
//   destination. One of the source and destination targets must be a memory
//   target and the other must be an xmm or ymm register. targetyparameterlist
//   is the mask and must be an xmm or ymm register. If the high bit of a 32 bit
//   section of the mask is set, then the corresponding 32 bit section is copied
//   from the source to the destination. If the high bit of a 32 bit section is
//   clear and the destination is an xmm or ymm register, the corresponding
//   32 bit section is cleared. If the high bit of a 32 bit section is clear
//   and the destination is memory, the corresponding 32 bit section of the
//   memory target is left unchanged.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VMASKMOVPS,     // if XMM1[31] is 1 then
//                                       //  [RBX][31:0]   -> XMM0[31:0]
//                                       // else
//                                       //  0             -> XMM0[31:0]
//                                       // if XMM1[63] is 1 then
//                                       //  [RBX][63:32]  -> XMM0[63:32]
//                                       // else
//                                       //  0             -> XMM0[63:32]
//                                       // if XMM1[95] is 1 then
//                                       //  [RBX][95:64]  -> XMM0[95:64]
//                                       // else
//                                       //  0             -> XMM0[95:64]
//                                       // if XMM1[127] is 1 then
//                                       //  [RBX][127:96] -> XMM0[127:96]
//                                       // else
//                                       //  0             -> XMM0[127:96]
//
//  XMM0  XMM1  RBX [R]  VMASKMOVPS,     // if XMM1[31] is 1 then
//                                       //  XMM0[31:0]   -> [RBX][31:0]
//                                       // if XMM1[63] is 1 then
//                                       //  XMM0[63:32]  -> [RBX][63:32]
//                                       // if XMM1[95] is 1 then
//                                       //  XMM0[95:64]  -> [RBX][95:64]
//                                       // if XMM1[127] is 1 then
//                                       //  XMM0[127:96] -> [RBX][127:96]
//
//
// Note:
//  Both the source and destination can not be xmm/ymm registers, one must be
//   a memory target. Both also can not be memory.
//  Intel docs say this instruction moves 4 or 8 32 bit floating point values,
//   but they can be any 32 bit values.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmaskmovqcomma ( VPMASKMOVQ, )
//
// C prototype:
//  void dg_forthvpmaskmovqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing
//   mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist and targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   and targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   YMMR                         specifies an ymmr register target.
//                                 YMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPMASKMOVQ instruction. This opcode sequence copies 64 bit values
//   from the source to the destination based on the value of the mask.
//   targetxparameterlist and targetzparameterlist are the source and
//   destination. One of the source and destination targets must be a memory
//   target and the other must be an xmm or ymm register. targetyparameterlist
//   is the mask and must be an xmm or ymm register. If the high bit of a 64 bit
//   section of the mask is set, then the corresponding 64 bit section is copied
//   from the source to the destination. If the high bit of a 64 bit section is
//   clear and the destination is an xmm or ymm register, the corresponding
//   64 bit section is cleared. If the high bit of a 64 bit section is clear
//   and the destination is memory, the corresponding 64 bit section of the
//   memory target is left unchanged.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VPMASKMOVQ,     // if XMM1[63] is 1 then
//                                       //  [RBX][63:0]   -> XMM0[63:0]
//                                       // else
//                                       //  0             -> XMM0[63:0]
//                                       // if XMM1[127] is 1 then
//                                       //  [RBX][127:64] -> XMM0[127:64]
//                                       // else
//                                       //  0             -> XMM0[127:64]
//
//  XMM0  XMM1  RBX [R]  VPMASKMOVQ,     // if XMM1[63] is 1 then
//                                       //  XMM0[63:0]   -> [RBX][63:0]
//                                       // if XMM1[127] is 1 then
//                                       //  XMM0[127:64] -> [RBX][127:64]
//
//
// Note:
//  Both the source and destination can not be xmm/ymm registers, one must be
//   a memory target. Both also can not be memory.
//  Intel docs say unlike VMASKMOVPD, this instruction does not issue a non
//   temporal hint, and can be used to access memory mapped I/O.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmaskmovdcomma ( VPMASKMOVD, )
//
// C prototype:
//  void dg_forthvpmaskmovdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing
//   mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist and targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   and targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   YMMR                         specifies an ymmr register target.
//                                 YMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPMASKMOVD instruction. This opcode sequence copies 32 bit values
//   from the source to the destination based on the value of the mask.
//   targetxparameterlist and targetzparameterlist are the source and
//   destination. One of the source and destination targets must be a memory
//   target and the other must be an xmm or ymm register. targetyparameterlist
//   is the mask and must be an xmm or ymm register. If the high bit of a 32 bit
//   section of the mask is set, then the corresponding 32 bit section is copied
//   from the source to the destination. If the high bit of a 32 bit section is
//   clear and the destination is an xmm or ymm register, the corresponding
//   32 bit section is cleared. If the high bit of a 32 bit section is clear
//   and the destination is memory, the corresponding 32 bit section of the
//   memory target is left unchanged.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VPMASKMOVD,     // if XMM1[31] is 1 then
//                                       //  [RBX][31:0]   -> XMM0[31:0]
//                                       // else
//                                       //  0             -> XMM0[31:0]
//                                       // if XMM1[63] is 1 then
//                                       //  [RBX][63:32]  -> XMM0[63:32]
//                                       // else
//                                       //  0             -> XMM0[63:32]
//                                       // if XMM1[95] is 1 then
//                                       //  [RBX][95:64]  -> XMM0[95:64]
//                                       // else
//                                       //  0             -> XMM0[95:64]
//                                       // if XMM1[127] is 1 then
//                                       //  [RBX][127:96] -> XMM0[127:96]
//                                       // else
//                                       //  0             -> XMM0[127:96]
//
//  XMM0  XMM1  RBX [R]  VPMASKMOVD,     // if XMM1[31] is 1 then
//                                       //  XMM0[31:0]   -> [RBX][31:0]
//                                       // if XMM1[63] is 1 then
//                                       //  XMM0[63:32]  -> [RBX][63:32]
//                                       // if XMM1[95] is 1 then
//                                       //  XMM0[95:64]  -> [RBX][95:64]
//                                       // if XMM1[127] is 1 then
//                                       //  XMM0[127:96] -> [RBX][127:96]
//
//
// Note:
//  Both the source and destination can not be xmm/ymm registers, one must be
//   a memory target. Both also can not be memory.
//  Intel docs say unlike VMASKMOVPD, this instruction does not issue a non
//   temporal hint, and can be used to access memory mapped I/O.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovdqacomma ( MOVDQA, )
//
// C prototype:
//  void dg_forthmovdqacomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVDQA instruction. This opcode sequence copies a 128 bit value
//   from the source to the destination. One of the targets can be
//   an xmm register, the other can be an xmm register or memory. Memory
//   targets need to be aligned on a 128 bit boundary.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  MOVDQA,     // [RBX][127:0] -> XMM0
//
//  XMM2  XMM0  MOVDQA,        // XMM2 -> XMM0
//
//  XMM2 <- XMM0  MOVDQA,   // XMM0 -> XMM2
//
//  XMM0 XMM8 MOVDQA,          // XMM0 -> XMM8
//
//
// Note:
//  Only 1 target can be a memory target.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovdqacomma ( VMOVDQA, )
//
// C prototype:
//  void dg_forthvmovdqacomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   YMMR                         specifies a ymm register target.
//                                 YMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVDQA instruction. This opcode sequence copies a 128 bit or 256
//   bit value from the source to the destination. One of the targets can be
//   an xmm register or ymm register, the other can be an xmm register,
//   ymm register, or memory. If the other target is an xmm register, then
//   the memory target needs to be aligned on a 128 bit boundary (16 byte).
//   If the other target is a ymm register, then the memory target needs to be
//   aligned on a 256 bit (32 byte) boundary. If both targets are register
//   targets, they both have to be xmm registers or both have to be ymm registers.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VMOVDQA,     // [RBX][127:0] -> XMM0
//
//  XMM2  XMM0  VMOVDQA,        // XMM2 -> XMM0
//
//  XMM2 <- XMM0  VMOVDQA,   // XMM0 -> XMM2
//
//  XMM0 XMM8 VMOVDQA,          // XMM0 -> XMM8
//
//
// Note:
//  Only 1 target can be a memory target.
//  Using XMMR or YMMR to specify the register target when using a memory target
//   will force 3 byte vex encoding even when a 2 byte vex form is available.
//   When both targets are registers, using XMMR or YMMR on the destination
//   target forces 3 byte vex.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovapdcomma ( VMOVAPD, )
//
// C prototype:
//  void dg_forthvmovapdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   YMMR                         specifies a ymm register target.
//                                 YMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVAPD instruction. This opcode sequence copies a 128 bit or 256
//   bit value from the source to the destination. One of the targets can be
//   an xmm register or ymm register, the other can be an xmm register,
//   ymm register, or memory. If the other target is an xmm register, then
//   the memory target needs to be aligned on a 128 bit boundary (16 byte).
//   If the other target is a ymm register, then the memory target needs to be
//   aligned on a 256 bit (32 byte) boundary. If both targets are register
//   targets, they both have to be xmm registers or both have to be ymm registers.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VMOVAPD,     // [RBX][127:0] -> XMM0
//
//  XMM2  XMM0  VMOVAPD,        // XMM2 -> XMM0
//
//  XMM2 <- XMM0  VMOVAPD,   // XMM0 -> XMM2
//
//  XMM0 XMM8 VMOVAPD,          // XMM0 -> XMM8
//
//
// Note:
//  Only 1 target can be a memory target.
//  Using XMMR or YMMR to specify the register target when using a memory target
//   will force 3 byte vex encoding even when a 2 byte vex form is available.
//   When both targets are registers, using XMMR or YMMR on the destination
//   target forces 3 byte vex.
//  Intel docs say this opcode sequence is for 2 or 4 64 bit floating point
//   values, but any 128 bit or 256 bit value will work.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovapscomma ( VMOVAPS, )
//
// C prototype:
//  void dg_forthvmovapscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   YMMR                         specifies a ymm register target.
//                                 YMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVAPS instruction. This opcode sequence copies a 128 bit or 256
//   bit value from the source to the destination. One of the targets can be
//   an xmm register or ymm register, the other can be an xmm register,
//   ymm register, or memory. If the other target is an xmm register, then
//   the memory target needs to be aligned on a 128 bit boundary (16 byte).
//   If the other target is a ymm register, then the memory target needs to be
//   aligned on a 256 bit (32 byte) boundary. If both targets are register
//   targets, they both have to be xmm registers or both have to be ymm registers.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VMOVAPS,     // [RBX][127:0] -> XMM0
//
//  XMM2  XMM0  VMOVAPS,        // XMM2 -> XMM0
//
//  XMM2 <- XMM0  VMOVAPS,   // XMM0 -> XMM2
//
//  XMM0 XMM8 VMOVAPS,          // XMM0 -> XMM8
//
//
// Note:
//  Only 1 target can be a memory target.
//  Using XMMR or YMMR to specify the register target when using a memory target
//   will force 3 byte vex encoding even when a 2 byte vex form is available.
//   When both targets are registers, using XMMR or YMMR on the destination
//   target forces 3 byte vex.
//  Intel docs say this opcode sequence is for 4 or 8 32 bit floating point
//   values, but any 128 bit or 256 bit value will work.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovdq2qcomma ( MOVDQ2Q, )
//
// C prototype:
//  void dg_forthmovdq2qcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   floatingpointregister FPSR
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVDQ2Q instruction. This opcode sequence moves a 64 bit value
//   from the lower 64 bits of the source to the destination. The source
//   target must be an xmm register. The destination target must be a floating
//   point register. Reverse is not supported.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  XMM1  ST0  MOVDQ2Q,             // XMM1[63:0] -> ST0
//
//  XMM0  ST3  MOVDQ2Q,             // XMM0[63:0] -> ST3
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovdrcomma ( MOVDR, )
//
// C prototype:
//  void dg_forthmovdrcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetxparameterlist or targetyparameterlist can contain
//   these addressing mode specifiers:
//
//   targetregister
//   targetregister R
//   targetdebugregister
//   targetdebugregister CR
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//
//   targetdebugregister          one of:
//                                 DR0 DR1 DR2 DR3 DR4 DR5 DR6 DR7
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   R                            specifies a register target.
//
//   CR                           specifies a control register target.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVDR instruction. This opcode sequence copies a value from the
//   source to the destination. One target must be a debug register, the
//   other target must be a regular register. Register size is ignored for
//   this instruction. Despite what you specify, the current address mode
//   size determines the register size used.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 32 bit address mode examples:
//  EAX  DR0  MOVDR,     // EAX -> DR0
//
//  DR0  EAX  MOVDR,     // DR0 -> EAX
//
// 64 bit address mode examples:
//  RAX  DR0  MOVDR,     // RAX -> DR0
//
//  DR0  RCX  MOVDR,     // DR0 -> RCX
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovsrcomma ( MOVSR, )
//
// C prototype:
//  void dg_forthmovsrcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   targetsegmentregister
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   targetsegmentregister SR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   targetsegmentregister        one of:
//                                 SRCS SRDS SRSS SRES SRFS SRGS
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVSR instruction. This opcode sequence copies a 16 bit value
//   from the source to the destination. One of the targets must be a segment
//   register. The other must be a register or memory target. If you use
//   the SR specifier with the segment register, that means you want
//   prefixes. If you use the SR specifier with a 64 bit destination register,
//   then you will get the rex.w prefix and the value will be zero
//   extended from 16 bits to 64 bits.
//   I may change how you choose to get the prefixes in the future...
//    2/29/2020 J.N.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  SRDS  MOVSR,            // [RAX][15:0] -> SRDS
//
//  SRCS  RDX [R]  MOVSR,            // SRCS -> [RDX][15:0]
//
//  SRDS SR  RAX  MOVSR,             // SRDS -> RAX[15:0] - rex.w prefix
//                                   //    0 -> RAX[63:16]
//
//  SRDS  RAX  MOVSR,                // SRDS -> RAX[15:0] - no rex.w prefix
//
//  SRDS  AX  MOVSR,                 // SRDS -> AX[15:0] - no 0x66 prefix
//
//  SRDS SR  AX  MOVSR,              // SRDS -> AX[15:0] - 0x66 prefix
//
//  RAX  SRSS  MOVSR,                // RAX[15:0] -> SRSS
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovhlpscomma ( MOVHLPS, )
//
// C prototype:
//  void dg_forthmovhlpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these target parameter lists can contain these
//   addressing mode specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVHLPS instruction. This opcode sequence copies a 64 bit value
//   from the high 64 bits of the source to the low 64 bits of the destination.
//   Both the source and destination targets must be xmm registers. The high
//   64 bits of the destination are not changed.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  XMM1  XMM0  MOVHLPS,            // XMM1[127:64] -> XMM0[63:0]
//
//  XMM0  XMM3  MOVHLPS,            // XMM0[127:64] -> XMM3[63:0]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovhlpscomma ( VMOVHLPS, )
//
// C prototype:
//  void dg_forthvmovhlpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter lists for all three targets can contain these
//   addressing mode specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVHLPS instruction. This opcode sequence copies a 64 bit value
//   from the high 64 bits of the source to the low 64 bits of the destination.
//   This opcode sequence also copies a 64 bit value from the high 64 bits of
//   target y to the high 64 bits of the destination.
//   All three targets must be xmm registers.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  XMM1  XMM2  XMM0  VMOVHLPS,           // XMM1[127:64] -> XMM0[63:0]
//                                        // XMM2[127:64] -> XMM0[127:64]
//
//  XMM0 <- XMM2  XMM1  VMOVHLPS,         // XMM1[127:64] -> XMM0[63:0]
//                                        // XMM2[127:64] -> XMM0[127:64]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovhpdcomma ( MOVHPD, )
//
// C prototype:
//  void dg_forthmovhpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for the target parameter lists can contain these addressing
//    mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the data size for a memory target, (you don't need to)
//   you can use this:
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 0 or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVHPD instruction. This opcode sequence moves a 64 bit value
//   from the source to the destination. One target must be an xmm register.
//   The other target must be a memory target. The data is moved from the
//   64 bit memory target to/from the high 64 bits of the xmm register target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  XMM1  RAX [R]  MOVHPD,             // XMM1[127:64] -> [RAX][63:0]
//
//  XMM0  RCX [R]  MOVHPD,             // XMM0[127:64] -> [RCX][63:0]
//
//  RAX [R]  XMM3  MOVHPD,             // [RAX][63:0] -> XMM3[127:64]
//
// Note:
//  Intel docs say this moves
//   a single double precision floating point value, but it doesn't matter
//   what kind of data is in the 64 bits. This compiling word uses a rex.w
//   prefix. If you want a shorter opcode sequence, use MOVHPS. It does the
//   same thing as this without the rex.w prefix.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovhpdcomma ( VMOVHPD, )
//
// C prototype:
//  void dg_forthvmovhpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for the target parameter lists can contain these addressing
//    mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the data size for a memory target, (you don't need to)
//   you can use this:
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   XMMR                         specifies an xmm register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 0 or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVHPD instruction. This opcode sequence moves a 64 bit value
//   from the low 64 bits of the memory target source to the high 64 bits of
//   the xmm register destination. It also moves the low 64 bits of the xmm
//   register second source to the low 64 bits of the xmm register destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
//  RAX [R]  XMM0  XMM3  VMOVHPD,             // [RAX][63:0] -> XMM3[127:64]
//                                            // XMM0[63:0]  -> XMM3[63:0]
//
// Note:
//  Intel docs say this moves double precision floating point values,
//   but it doesn't matter what kind of data is in the 64 bits.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovhpd2comma ( VMOVHPD2, )
//
// C prototype:
//  void dg_forthvmovhpd2comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for the target parameter lists can contain these addressing
//    mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the data size for a memory target, (you don't need to)
//   you can use this:
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size.
//                                 (8 bits only for this instruction)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0, 1, 2, or 4
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   XMMR                         specifies an XMM register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 0 or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVHPD2 instruction. This opcode sequence moves a 64 bit value
//   from the high 64 bits of the source xmm register to the 64 bit memory
//   target destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  XMM1  RAX [R]  VMOVHPD2,             // XMM1[127:64] -> [RAX][63:0]
//
//  XMM0  RCX [R]  VMOVHPD2,             // XMM0[127:64] -> [RCX][63:0]
//
//
// Note:
//  Intel docs say this moves a single double precision floating point value,
//    but it doesn't matter what kind of data is in the 64 bits.
//  This is the VMOVHPD xmm register to memory instruction. I added the 2
//   because it uses a different number of targets than when memory is
//   the source.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovhpscomma ( VMOVHPS, )
//
// C prototype:
//  void dg_forthvmovhpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for the target parameter lists can contain these addressing
//    mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the data size for a memory target, (you don't need to)
//   you can use this:
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   XMMR                         specifies an xmm register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 0 or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVHPS instruction. This opcode sequence moves a 64 bit value
//   from the low 64 bits of the memory target source to the high 64 bits of
//   the xmm register destination. It also moves the low 64 bits of the xmm
//   register second source to the low 64 bits of the xmm register destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
//  RAX [R]  XMM0  XMM3  VMOVHPS,             // [RAX][63:0] -> XMM3[127:64]
//                                            // XMM0[63:0]  -> XMM3 [63:0]
//
// Note:
//  Intel docs say this moves single precision floating point values,
//   but it doesn't matter what kind of data is in the 64 bits.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovhps2comma ( VMOVHPS2, )
//
// C prototype:
//  void dg_forthvmovhps2comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for the target parameter lists can contain these addressing
//    mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the data size for a memory target, (you don't need to)
//   you can use this:
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   XMMR                         specifies an XMM register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 0 or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVHPS2 instruction. This opcode sequence moves a 64 bit value
//   from the high 64 bits of the source xmm register to the 64 bit memory
//   target destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  XMM1  RAX [R]  VMOVHPS2,             // XMM1[127:64] -> [RAX][63:0]
//
//  XMM0  RCX [R]  VMOVHPS2,             // XMM0[127:64] -> [RCX][63:0]
//
//
// Note:
//  Intel docs say this moves single precision floating point values,
//    but it doesn't matter what kind of data is in the 64 bits.
//  This is the VMOVHPS xmm register to memory instruction. I added the 2
//   because it uses a different number of targets than when memory is
//   the source.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovhpscomma ( MOVHPS, )
//
// C prototype:
//  void dg_forthmovhpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for the target parameter lists can contain these addressing
//    mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the data size for a memory target, (you don't need to)
//   you can use this:
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 0 or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVHPS instruction. This opcode sequence moves a 64 bit value
//   from the source to the destination. One target must be an xmm register.
//   The other target must be a memory target. The data is moved between the
//   64 bit memory target and the high 64 bits of the xmm register target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  XMM1  RAX [R]  MOVHPS,             // XMM1[127:64] -> [RAX][63:0]
//
//  XMM0  RCX [R]  MOVHPS,             // XMM0[127:64] -> [RCX][63:0]
//
//  RAX [R]  XMM3  MOVHPS,             // [RAX][63:0] -> XMM3[127:64]
//
// Note:
//  The only difference between this compiling word and MOVHPD is that this
//   compiling word does not use the 0x66 prefix in the opcode sequence
//   (MOVHPS is 0F 16/17, MOVHPD is 66 0F 16/17). Intel docs say this
//   opcode sequence moves two single precision floating point values, but
//   it doesn't matter what kind of data is in the 64 bits.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovlhpscomma ( MOVLHPS, )
//
// C prototype:
//  void dg_forthmovlhpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these target parameter lists can contain these
//   addressing mode specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVLHPS instruction. This opcode sequence copies a 64 bit value
//   from the lower 64 bits of the source to the upper 64 bits of the destination.
//   Both the source and destination targets must be xmm registers. The lower
//   64 bits of the destination are not changed.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  XMM1  XMM0  MOVLHPS,            // XMM1[63:0] -> XMM0[127:64]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovlhpscomma ( VMOVLHPS, )
//
// C prototype:
//  void dg_forthvmovlhpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for these target parameter lists can contain these
//   addressing mode specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVLHPS instruction. This opcode sequence copies a 64 bit value
//   from the lower 64 bits of the source to the upper 64 bits of the destination.
//   This opcode sequence also copies a 64 bit value from the lower 64 bits of
//   target y to the lower 64 bits of the destination.
//   All three targets must be xmm registers.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  XMM1  XMM2  XMM0  VMOVLHPS,            // XMM1[63:0] -> XMM0[127:64]
//                                         // XMM2[63:0] -> XMM0[63:0]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovlpdcomma ( VMOVLPD, )
//
// C prototype:
//  void dg_forthvmovlpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  Each target parameter list can contain these addressing
//    mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the data size for a memory target, (you don't need to)
//   you can use this:
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   XMMR                         specifies an xmm register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 0 or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVLPD instruction. This opcode sequence moves a 64 bit value
//   from the 64 bit memory target source to the low 64 bits of the xmm
//   register destination. It also moves the high 64 bits of the second
//   source xmm register to the high 64 bits of the xmm register destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
//  RAX [R]  XMM0  XMM3  VMOVLPD,             // [RAX][63:0] -> XMM3[63:0]
//                                            // XMM0[127:64]  -> XMM3 [127:64]
//
// Note:
//  Intel docs say this moves double precision floating point values,
//   but it doesn't matter what kind of data is in the 64 bits.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovlpd2comma ( VMOVLPD2, )
//
// C prototype:
//  void dg_forthvmovlpd2comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  Each target parameter list can contain these addressing
//    mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the data size for a memory target, (you don't need to)
//   you can use this:
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   XMMR                         specifies an XMM register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 0 or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVLPD2 instruction. This opcode sequence moves a 64 bit value
//   from the low 64 bits of the source xmm register to the 64 bit memory
//   target destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  XMM1  RAX [R]  VMOVLPD2,             // XMM1[63:0] -> [RAX][63:0]
//
//  XMM0  RCX [R]  VMOVLPD2,             // XMM0[63:0] -> [RCX][63:0]
//
//
// Note:
//  Intel docs say this moves a single double precision floating point value,
//    but it doesn't matter what kind of data is in the 64 bits.
//  This is the VMOVLPD xmm register to memory instruction. I added the 2
//   because it uses a different number of targets than when memory is
//   the source.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovlpscomma ( VMOVLPS, )
//
// C prototype:
//  void dg_forthvmovlpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  Each target parameter list can contain these addressing
//    mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the data size for a memory target, (you don't need to)
//   you can use this:
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   XMMR                         specifies an xmm register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 0 or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVLPS instruction. This opcode sequence moves a 64 bit value
//   from the 64 bit memory target source to the low 64 bits of the xmm
//   register destination. It also moves the high 64 bits of the second
//   source xmm register to the high 64 bits of the xmm register destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
//  RAX [R]  XMM0  XMM3  VMOVLPS,             // [RAX][63:0] -> XMM3[63:0]
//                                            // XMM0[127:64]  -> XMM3 [127:64]
//
// Note:
//  Intel docs say this moves single precision floating point values,
//   but it doesn't matter what kind of data is in the 64 bits.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovlps2comma ( VMOVLPS2, )
//
// C prototype:
//  void dg_forthvmovlps2comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  Each target parameter list can contain these addressing
//    mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the data size for a memory target, (you don't need to)
//   you can use this:
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   XMMR                         specifies an XMM register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 0 or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVLPS2 instruction. This opcode sequence moves a 64 bit value
//   from the low 64 bits of the source xmm register to the 64 bit memory
//   target destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  XMM1  RAX [R]  VMOVLPS2,             // XMM1[63:0] -> [RAX][63:0]
//
//  XMM0  RCX [R]  VMOVLPS2,             // XMM0[63:0] -> [RCX][63:0]
//
//
// Note:
//  Intel docs say this moves single precision floating point values,
//    but it doesn't matter what kind of data is in the 64 bits.
//  This is the VMOVLPS xmm register to memory instruction. I added the 2
//   because it uses a different number of targets than when memory is
//   the source.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovsdcomma ( VMOVSD, )
//
// C prototype:
//  void dg_forthvmovsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  Each target parameter list can contain these addressing
//    mode specifiers:
//
//   targetxmmregister
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmm register target.
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 0 or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVSD instruction. This opcode sequence moves a 64 bit value
//   from the low 64 bits of the xmm register source to the low 64 bits of
//   the xmm register destination. It also moves the high 64 bits of the second
//   source xmm register to the high 64 bits of the xmm register destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
//  XMM2  XMM0  XMM3  VMOVSD,                // XMM2[63:0]    -> XMM3[63:0]
//                                           // XMM0[127:64]  -> XMM3 [127:64]
//
// Note:
//  Intel docs say this moves double precision floating point values,
//   but it doesn't matter what kind of data is in the 64 bits.
//  There are two opcode sequences for this instruction. If you use reverse with
//   this instruction you will get the alternate opcode sequence.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovsd2comma ( VMOVSD2, )
//
// C prototype:
//  void dg_forthvmovsd2comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  Each of the target parameter lists can contain these addressing
//    mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the data size for a memory target, (you don't need to)
//   you can use this:
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   XMMR                         specifies an XMM register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 0 or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVSD2 instruction. This opcode sequence moves a 64 bit value
//   from the low 64 bits of the source xmm register to the 64 bit memory
//   target destination. This opcode sequence can also move a 64 bit value
//   from a 64 bit memory target source to the low 64 bits of an xmm
//   register destination. Both targets can not be memory, and both targets
//   can not be xmm registers. If the destination is an xmm register, the
//   upper bits of the destination xmm register are cleared.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  XMM1  RAX [R]  VMOVSD2,             // XMM1[63:0]  -> [RAX][63:0]
//
//  XMM0  RCX [R]  VMOVSD2,             // XMM0[63:0]  -> [RCX][63:0]
//
//  RCX [R]  XMM0  VMOVSD2,             // [RCX][63:0] -> XMM0[63:0]
//                                      //           0 -> XMM0[127:64]
//
// Note:
//  Intel docs say this moves double precision floating point values,
//    but it doesn't matter what kind of data is in the 64 bits.
//  This is the VMOVSD xmm register to/from memory instruction. I added the 2
//   because it uses a different number of targets than when only xmm registers
//   are used.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovsscomma ( VMOVSS, )
//
// C prototype:
//  void dg_forthvmovsscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  Each of the target parameter lists can contain these addressing
//    mode specifiers:
//
//   targetxmmregister
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmm register target.
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 0 or 4
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVSS instruction. This opcode sequence moves a 32 bit value
//   from the low 32 bits of the source xmm register to the low 32 bits of
//   the destination xmm register. It also moves the high 96 bits of the
//   second source xmm register to the high 96 bits of the destination xmm
//   register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
//  XMM2  XMM0  XMM3  VMOVSS,                // XMM2[31:0]    -> XMM3[31:0]
//                                           // XMM0[127:32]  -> XMM3[127:32]
//
// Note:
//  Intel docs say this moves single precision floating point values,
//   but it doesn't matter what kind of data is in the 32 bits.
//  There are two opcode sequences for this instruction. If you use reverse with
//   this instruction you will get the alternate opcode sequence.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovss2comma ( VMOVSS2, )
//
// C prototype:
//  void dg_forthvmovss2comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  Each of the target parameter lists can contain these addressing
//    mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the data size for a memory target, (you don't need to)
//   you can use this:
//   32BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after the memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   XMMR                         specifies an XMM register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 0 or 4
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVSS2 instruction. This opcode sequence moves a 32 bit value
//   from the low 32 bits of the source xmm register to the 32 bit memory
//   target destination. This opcode sequence can also move a 32 bit value
//   from a 32 bit memory target source to the low 32 bits of an xmm
//   register destination. Both targets can not be memory, and both targets
//   can not be xmm registers. If the destination is an xmm register, the
//   upper bits of the destination xmm register are cleared.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  XMM1  RAX [R]  VMOVSS2,             // XMM1[31:0]  -> [RAX][31:0]
//
//  XMM0  RCX [R]  VMOVSS2,             // XMM0[31:0]  -> [RCX][31:0]
//
//  RCX [R]  XMM0  VMOVSS2,             // [RCX][31:0] -> XMM0[31:0]
//                                      //           0 -> XMM0[127:32]
//
// Note:
//  Intel docs say this moves single precision floating point values,
//    but it doesn't matter what kind of data is in the 32 bits.
//  This is the VMOVSS xmm register to/from memory instruction. I added the 2
//   because it uses a different number of targets than when only xmm registers
//   are used.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovlpdcomma ( MOVLPD, )
//
// C prototype:
//  void dg_forthmovlpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  Each of the target parameter lists can contain these addressing
//    mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the data size for a memory target, (you don't need to)
//   you can use this:
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 0 or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVLPD instruction. This opcode sequence moves a 64 bit value
//   from the source to the destination. One target must be an xmm register.
//   The other target must be a memory target. The data is moved from the
//   64 bit memory target to/from the low 64 bits of the xmm register target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  XMM1  RAX [R]  MOVLPD,             // XMM1[63:0] -> [RAX][63:0]
//
//  XMM0  RCX [R]  MOVLPD,             // XMM0[63:0] -> [RCX][63:0]
//
//  RAX [R]  XMM3  MOVLPD,             // [RAX][63:0] -> XMM3[63:0]
//
// Note:
//  Intel docs say this moves
//   a single double precision floating point value, but it doesn't matter
//   what kind of data is in the 64 bits. This compiling word uses a rex.w
//   prefix. If you want a shorter opcode sequence, use MOVLPS. It does the
//   same thing as this without the rex.w prefix.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovlpscomma ( MOVLPS, )
//
// C prototype:
//  void dg_forthmovlpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  Each of the target parameter lists can contain these addressing
//    mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the data size for a memory target, (you don't need to)
//   you can use this:
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 0 or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVLPS instruction. This opcode sequence moves a 64 bit value
//   from the source to the destination. One target must be an xmm register.
//   The other target must be a memory target. The data is moved between the
//   64 bit memory target and the low 64 bits of the xmm register target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  XMM1  RAX [R]  MOVLPS,             // XMM1[63:0] -> [RAX][63:0]
//
//  XMM0  RCX [R]  MOVLPS,             // XMM0[63:0] -> [RCX][63:0]
//
//  RAX [R]  XMM3  MOVLPS,             // [RAX][63:0] -> XMM3[63:0]
//
// Note:
//  The only difference between
//   this compiling word and MOVLPD is that this compiling word does not
//   use a rex.w prefix in the opcode sequence. Intel docs say this
//   opcode sequence moves two single floating point values, but it doesn't
//   matter what kind of data is in the 64 bits.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovmskpdcomma ( MOVMSKPD, )
//
// C prototype:
//  void dg_forthmovmskpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   R                            specifies a register target.
//                                 R is optional
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVMSKPD instruction. This opcode sequence extracts the sign bits
//   from the two double floating point values in the source and puts the two
//   bits into the lower two bits of the destination register.
//   For this compiling word the size of the destination register is ignored.
//   No matter what you specify, the two bits are zero extended to the current
//   address mode size. Also, the values don't have to be floating point. Any
//   64 bit values will work.

//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 32 bit example:
//  XMM1  AL  MOVMSKPD,             // XMM1[63]  -> EAX[0]
//                                  // XMM1[127] -> EAX[1]
//                                  //         0 -> EAX[31:2]
//
// 64 bit example
//  XMM3  CL  MOVMSKPD,             // XMM3[63]  -> RCX[0]
//                                  // XMM3[127] -> RCX[1]
//                                  //         0 -> RCX[63:2]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovmskpdcomma ( VMOVMSKPD, )
//
// C prototype:
//  void dg_forthvmovmskpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   targetxmmregister XMMR
//   targetymmregister YMMR
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//
//   R                            specifies a register target.
//                                 R is optional
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVMSKPD instruction. This opcode sequence extracts the sign bits
//   from two or four double floating point values in the source and puts the 
//   two or four bits into the lower two or four bits of the destination 
//   register.
//   The two or four bits are zero extended to the current address mode size
//   regardless of what register size you specify. Also, the values don't 
//   have to be floating point. Any 64 bit values will work.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 32 bit address mode example:
//  XMM1  AL  VMOVMSKPD,            // XMM1[63]  -> EAX[0]
//                                  // XMM1[127] -> EAX[1]
//                                  //         0 -> EAX[31:2]
//
// 64 bit address mode example
//  YMM3  CL  VMOVMSKPD,            // YMM3[63]  -> RCX[0]
//                                  // YMM3[127] -> RCX[1]
//                                  // YMM3[191] -> RCX[2]
//                                  // YMM3[255] -> RCX[3]
//                                  //         0 -> RCX[63:4]
//
// Note:
//  This compiling word uses the size of the destination register to determine
//   whether or not to set the vex.w bit, but the opcode sequence ignores the
//   state of this vex.w. Other than that, the size of the destination register
//   is ignored.
//  The source must be an xmm or ymm register.
//  The destination must be a regular register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovmskpscomma ( MOVMSKPS, )
//
// C prototype:
//  void dg_forthmovmskpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   R                            specifies a register target.
//                                 R is optional
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVMSKPS instruction. This opcode sequence extracts the sign bits
//   from the four single floating point values in the source and puts the four
//   bits into the lower four bits of the destination register.
//   For this compiling word the size of the destination register is ignored.
//   No matter what you specify, the four bits are zero extended to the current
//   address mode size. Also, the values don't have to be floating point. Any
//   32 bit values will work.

//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 32 bit example:
//  XMM1  AL  MOVMSKPS,             // XMM1[31]  -> EAX[0]
//                                  // XMM1[63]  -> EAX[1]
//                                  // XMM1[95]  -> EAX[2]
//                                  // XMM1[127] -> EAX[3]
//                                  //         0 -> EAX[31:4]
//
// 64 bit example
//  XMM3  CL  MOVMSKPS,             // XMM3[31]  -> RCX[0]
//                                  // XMM3[63]  -> RCX[1]
//                                  // XMM3[95]  -> RCX[2]
//                                  // XMM3[127] -> RCX[3]
//                                  //         0 -> RCX[63:4]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovmskpscomma ( VMOVMSKPS, )
//
// C prototype:
//  void dg_forthvmovmskpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   targetxmmregister XMMR
//   targetymmregister YMMR
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//
//   R                            specifies a register target.
//                                 R is optional
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVMSKPS instruction. This opcode sequence extracts the sign bits
//   from the four or eight single floating point values in the source and puts 
//   the four or eight bits into the lower four or eight bits of the destination 
//   register.
//   The four or eight bits are zero extended to the current address mode size
//   regardless of what register size you specify. Also, the values don't 
//   have to be floating point. Any 32 bit values will work.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 32 bit example:
//  XMM1  AL  VMOVMSKPS,            // XMM1[31]  -> EAX[0]
//                                  // XMM1[63]  -> EAX[1]
//                                  // XMM1[95]  -> EAX[2]
//                                  // XMM1[127] -> EAX[3]
//                                  //         0 -> EAX[31:4]
//
// 64 bit example
//  YMM3  CL  VMOVMSKPS,            // YMM3[31]  -> RCX[0]
//                                  // YMM3[63]  -> RCX[1]
//                                  // YMM3[95]  -> RCX[2]
//                                  // YMM3[127] -> RCX[3]
//                                  // YMM3[159] -> RCX[4]
//                                  // YMM3[191] -> RCX[5]
//                                  // YMM3[223] -> RCX[6]
//                                  // YMM3[255] -> RCX[7]
//                                  //         0 -> RCX[63:8]
//
// Note:
//  This compiling word uses the size of the destination register to determine
//   whether or not to set the vex.w bit, but the opcode sequence ignores the
//   state of this vex.w. Other than that, the size of the destination register
//   is ignored.
//  The source must be an xmm or ymm register.
//  The destination must be a regular register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovnticomma ( MOVNTI, )
//
// C prototype:
//  void dg_forthmovnticomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to set the data size for a memory target (you don't need to),
//   you can use these:
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 0, 4 or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVNTI instruction. This opcode sequence moves a 32 bit or 64 bit
//   value from the source register to the destination memory target using a
//   non temporal hint. This means the write does not put the value into
//   the cache hierarchy among other things. The size of the register you
//   specify determines the data size of the write. If you specify a data
//   size for the memory target, it must match the data size of the register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//   ECX  RAX [R]  MOVNTI,   // ECX -> [RAX][31:0]
//
//   RAX  RCX [R]  MOVNTI,   // RAX -> [RCX][63:0]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovntdqcomma ( MOVNTDQ, )
//
// C prototype:
//  void dg_forthmovntdqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVNTDQ instruction. This opcode sequence moves a 128 bit
//   value from the source xmm register to the destination memory
//   target using a non temporal hint. This means the write does not put the
//   value into the cache hierarchy among other things. I haven't added the
//   data size specifier 128BIT yet so you can't explicitly declare the
//   data size of the memory target yet. J.N. 3/29/2020.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//   XMM1  RAX [R]  MOVNTDQ,  // XMM1 -> [RAX][127:0]
//
//   XMM3  RCX [R]  MOVNTDQ,  // XMM3 -> [RCX][127:0]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovntdqcomma ( VMOVNTDQ, )
//
// C prototype:
//  void dg_forthvmovntdqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing
//    mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   XMMR                         specifies an xmm register target.
//   YMMR                         specifies a ymm register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVNTDQ instruction. This opcode sequence moves a 128 bit or 256
//   bit value from the source xmm or ymm register to the destination memory
//   target using a non temporal hint. This means the write does not put the
//   value into the cache hierarchy among other things.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//   XMM1  RAX [R]  VMOVNTDQ,  // XMM1 -> [RAX][127:0]
//
//   XMM3  RCX [R]  VMOVNTDQ,  // XMM3 -> [RCX][127:0]
//
//   YMM3  RCX [R]  VMOVNTDQ,  // YMM3 -> [RCX][255:0]
//
// Note:
//  The source must be an xmm or ymm register.
//  The destination must be a memory target.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovntdqacomma ( MOVNTDQA, )
//
// C prototype:
//  void dg_forthmovntdqacomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVNTDQA instruction. This opcode sequence moves a 128 bit
//   value from the source memory target to an xmm register target using a
//   non temporal hint if the memory is a write combining type. The memory
//   target must be aligned on a 128 bit boundary.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//   RAX [R]  XMM1  MOVNTDQA,  // [RAX][127:0] -> XMM1
//
//   RCX [R]  XMM3  MOVNTDQA,  // [RCX][127:0] -> XMM3
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovntdqacomma ( VMOVNTDQA, )
//
// C prototype:
//  void dg_forthvmovntdqacomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing
//    mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   XMMR                         specifies an xmm register target.
//   YMMR                         specifies a ymm register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVNTDQA instruction. This opcode sequence moves a 128 bit or 256
//   bit value from the source memory target to the destination xmm or ymm
//   register target. The load may use a non temporal hint.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//   RAX [R]  XMM1  VMOVNTDQA,  // [RAX][127:0] -> XMM1
//
//   RCX [R]  XMM3  VMOVNTDQA,  // [RCX][127:0] -> XMM3
//
//   RCX [R]  YMM3  VMOVNTDQA,  // [RCX][255:0] -> YMM3
//
// Note:
//  The destination must be an xmm or ymm register.
//  The source must be a memory target.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovntpdcomma ( MOVNTPD, )
//
// C prototype:
//  void dg_forthmovntpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVNTPD instruction. This opcode sequence moves a 128 bit
//   value from the source xmm register to the destination memory
//   target using a non temporal hint. This means the write does not put the
//   value into the cache hierarchy among other things. I haven't added the
//   data size specifier 128BIT yet so you can't explicitly declare the
//   data size of the memory target yet. J.N. 3/29/2020. Intel docs say
//   this instruction acts on two packed double floating point values but
//   it probably does not matter if they are floating point values. Any 128 bit
//   number will work.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//   XMM1  RAX [R]  MOVNTPD,  // XMM1 -> [RAX][127:0]
//
//   XMM3  RCX [R]  MOVNTPD,  // XMM3 -> [RCX][127:0]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovntpdcomma ( VMOVNTPD, )
//
// C prototype:
//  void dg_forthvmovntpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   XMMR                         specifies an xmm register target.
//   YMMR                         specifies a ymm register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVNTPD instruction. This opcode sequence moves a 128 bit or 256
//   bit value from the source xmm or ymm register to the destination memory
//   target using a non temporal hint. This means the write does not put the
//   value into the cache hierarchy among other things.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//   XMM1  RAX [R]  VMOVNTPD,  // XMM1 -> [RAX][127:0]
//
//   XMM3  RCX [R]  VMOVNTPD,  // XMM3 -> [RCX][127:0]
//
//   YMM3  RCX [R]  VMOVNTPD,  // YMM3 -> [RCX][255:0]
//
// Note:
//  The source must be an xmm or ymm register.
//  The destination must be a memory target.
//  Using XMMR or YMMR to specify source target will force three byte vex
//   encoding.
//  Intel docs say this instruction acts on two packed double floating point
//   values but it probably does not matter if they are floating point values.
//   Any 128 bit or 256 bit number will work.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovntpscomma ( MOVNTPS, )
//
// C prototype:
//  void dg_forthmovntpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVNTPS instruction. This opcode sequence moves a 128 bit
//   value from the source xmm register to the destination memory
//   target using a non temporal hint. This means the write does not put the
//   value into the cache hierarchy among other things. I haven't added the
//   data size specifier 128BIT yet so you can't explicitly declare the
//   data size of the memory target yet. J.N. 3/29/2020. Intel docs say
//   this instruction acts on four packed single floating point values but
//   it probably does not matter if they are floating point values. Any 128 bit
//   number will work.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//   XMM1  RAX [R]  MOVNTPS,  // XMM1 -> [RAX][127:0]
//
//   XMM3  RCX [R]  MOVNTPS,  // XMM3 -> [RCX][127:0]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmovntpscomma ( VMOVNTPS, )
//
// C prototype:
//  void dg_forthvmovntpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   XMMR                         specifies an xmm register target.
//   YMMR                         specifies a ymm register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VMOVNTPS instruction. This opcode sequence moves a 128 bit or 256
//   bit value from the source xmm or ymm register to the destination memory
//   target using a non temporal hint. This means the write does not put the
//   value into the cache hierarchy among other things.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//   XMM1  RAX [R]  VMOVNTPS,  // XMM1 -> [RAX][127:0]
//
//   XMM3  RCX [R]  VMOVNTPS,  // XMM3 -> [RCX][127:0]
//
//   YMM3  RCX [R]  VMOVNTPS,  // YMM3 -> [RCX][255:0]
//
// Note:
//  The source must be an xmm or ymm register.
//  The destination must be a memory target.
//  Using XMMR or YMMR to specify source target will force three byte vex
//   encoding.
//  Intel docs say this instruction acts on four packed single floating point
//   values but it probably does not matter if they are floating point values.
//   Any 128 bit or 256 bit number will work.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovqcomma ( MOVQ, )
//
// C prototype:
//  void dg_forthmovqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVQ instruction. This opcode sequence copies a 64 bit value from
//   the source to the destination. Only one target can be a memory target.
//   If neither target is a memory target, both targets must be the same type
//   of register. If the source target is an xmm register, the low 64 bits
//   of the xmm register are copied. If the destination register is an xmm
//   register, the value is zero extended to 128 bits.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  MOVQ,  // [RBX][63:0] -> XMM1[63:0]
//                        //           0 -> XMM1[127:64]
//
//  RAX [R]  ST3   MOVQ,  // [RAX][63:0] -> ST3[63:0]
//
//  XMM2  RCX [R]  MOVQ,  // XMM2[63:0]  -> [RCX][63:0]
//
//  ST5  RDX [R]  MOVQ,   // ST5[63:0]   -> [RDX][63:0]
//
//  XMM2  XMM1  MOVQ,     // XMM2[63:0]  -> XMM1[63:0]
//                        //          0  -> XMM1[127:64]
//
//  ST2  ST1 MOVQ,        // ST2[63:0]   -> ST1[63:0]
//
//  ST2  ST1  <-  MOVQ, // ST1[63:0]  -> ST2[63:0]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovq2dqcomma ( MOVQ2DQ, )
//
// C prototype:
//  void dg_forthmovq2dqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   floatingpointregister FPSR
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVQ2DQ instruction. This opcode sequence moves a 64 bit value
//   from the source to the lower 64 bits of the destination. The source
//   target must be a floating point register. The destination target must be
//   an xmm register. The upper 64 bits of the xmm register are cleared.
//   Reverse is not supported.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  ST0  XMM1  MOVQ2DQ,             // ST0 -> XMM1[63:0]
//                                  //   0 -> XMM1[127:64]
//
//  ST3  XMM0  MOVQ2DQ,             // ST3 -> XMM0[63:0]
//                                  //   0 -> XMM0[127:64]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovsd2comma ( MOVSD2, )
//
// C prototype:
//  void dg_forthmovsd2comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 MOVSD2 instruction. This opcode sequence copies a 64 bit value
//   from the source to the destination. One of the targets can be
//   an xmm register, the other can be an xmm register or memory. I do not
//   know if memory targets need to be aligned on a 128 bit boundary.
//   If the source is an XMM register, only the lower 64 bits are used.
//   If the destination is an XMM register, the value is copied to the lower
//   64 bits and the upper 64 bits may be cleared or left unchanged
//   depending on the source type; if the source was memory, then the upper
//   64 bits are cleared, otherwise they are left unchanged.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  MOVSD2,     // [RBX][63:0] -> XMM0[63:0]
//                             //           0 -> XMM0[127:64]
//
//  XMM2  XMM0  MOVSD2,        // XMM2[63:0] -> XMM0[63:0]
//
//  XMM2 <- XMM0  MOVSD2,   // XMM0[63:0] -> XMM2[63:0]
//
//  XMM0 XMM8 MOVSD2,          // XMM0[63:0] -> XMM8[63:0]
//
//  XMM0  RBX [R]  MOVSD2,     // XMM0[63:0] -> [RBX][63:0]
//
// Note:
//  Only 1 target can be a memory target.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthorcomma ( OR, )
//
// C prototype:
//  void dg_forthorcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist          
//  targetyparameterlist          
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   immediatevalue minimumimmediatesize IMMEDIATE
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   8BIT
//   16BIT
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               integer constant that gets sign extended to
//                                 the DATASIZE of the instruction
//                                 64BIT encoding uses multiple instructions
//                                 smaller encodings get sign extended if needed
//   minimumimmediatesize         minimum encoding size in bytes for 
//                                 immediatevalue.
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. If N is larger than a signed
//                                 32 bit integer, multiple instructions 
//                                 are compiled and if the destination is memory
//                                 RAX is used.
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate 
//                                 value in bytes, can be either 0, 1, 2, or 4
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target. 
//                                 R is optional.
//                                 Using R also forces some instructions to be 
//                                 encoded using the MODR encoding.
//                                 For INC, and DEC, in 32 bit mode; and also
//                                  PUSH, and POP, using R forces the use
//                                  of MODR encoding instead of opcode+r.
//                                 For the immediate to register A (AL, AX,
//                                  EAX, or RAX) forms of ADD, ADC, AND, OR,
//                                  XOR, SUB, SBB, and CMP, using R forces the
//                                  use of MODR encoding instead of the
//                                  register A opcodes.
//                                 The MODR encoding for add 8 bit immediate
//                                  sign extended to 32 or 64 bits is shorter
//                                  than the register A opcode encoding. If
//                                  you want this shorter encoding, you have
//                                  to use R.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m is not supported.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1, 2, 4, or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 OR instruction. This sequence binary ors the destination target
//    with the source target and stores the result in the destination target,
//    changing the condition code flags accordingly.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX 12348000 N  EAX  OR,  // ors 12348000 with EAX to EAX
//  27 N  CL  OR,             // ors 27 with CL to CL
//  AX  EBX [R]  OR,          // ors AX with the 16 bit memory at the address
//                            //  in EBX to the 16 bit memory at the address
//                            //  in EBX
//  38 N  EDX [R]  32BIT OR,  // size required, ors 38 with the 32 bit memory
//                            //  at the address in EDX to the 32 bit memory
//                            //  at the address in EDX
//  ECX  EAX  OR,             // ors ECX with EAX to EAX
//  ECX <- EAX  OR,        // ors EAX with ECX to ECX
//  -38 N  RAX R  OR,         // ors -38 with RAX using the n8 to n64 modr/m 
//                            //  sign extended encoding
//
// Note:
//  Only 1 target can be a memory target.
//  If the address mode is 64 bits, the data size is 64 bits, and N is larger 
//   than what will fit in a signed 32 bit integer, and the destination is memory
//   then  value N  RAX  MOV,  RAX mem OR, is compiled.
//  If the address mode is 64 bits, the data size is 64 bits, and N is larger 
//   than what will fit in a signed 32 bit integer, and the destination is a
//   register then  register PUSH,  N register MOV,  register RSP [R] OR,
//   register POP,  is compiled.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some strange
//   things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthorpdcomma ( ORPD, )
//
// C prototype:
//  void dg_forthorpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 ORPD instruction. Intel docs say this opcode sequence does a bitwise
//   or of the two double floating point values in the destination target with the
//   two double floating point values in the source target and puts the results
//   in the destination target. It's really just the same thing as a 128 bit
//   binary or.
//   It looks like the flags are not modified.
//   No floating point exceptions are generated.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  ORPD,      // [RBX] or XMM0 -> XMM0
//  XMM2  XMM0  ORPD,         // XMM2 or XMM0 -> XMM0
//  XMM2 <- XMM0  ORPD,    // XMM0 or XMM2 -> XMM2
//  XMM0 XMM8 ORPD,           // XMM0 or XMM8 -> XMM8
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvorpdcomma ( VORPD, )
//
// C prototype:
//  void dg_forthvorpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymm register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VORPD instruction. Intel docs say this opcode sequence does a bitwise
//   or of the packed double floating point values (two for xmm targets, four for
//   ymm targets) in target y with the packed double floating point values in the
//   source target and puts the results in the destination target. It's really
//   just the same thing as a 128 bit or 256 bit binary or.
//   It looks like the flags are not modified.
//   No floating point exceptions are generated.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VORPD,      // [RBX] or XMM1 -> XMM0
//  XMM2  XMM1  XMM0  VORPD,         // XMM2 or XMM1 -> XMM0
//  XMM2 <- XMM1  XMM0  VORPD,    // XMM0 or XMM1 -> XMM2
//  YMM0 YMM1  YMM8 VORPD,           // YMM0 or YMM1 -> YMM8
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthorpscomma ( ORPS, )
//
// C prototype:
//  void dg_forthorpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 ORPS instruction. Intel docs say this opcode sequence does a bitwise
//   or of the four single floating point values in the destination target with
//   the four single floating point values in the source target and puts the
//   results in the destination target. It's really just a 128 bit binary or.
//   It looks like the flags are not modified.
//   No floating point exceptions are generated.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  ORPS,      // [RBX] or XMM0 -> XMM0
//  XMM2  XMM0  ORPS,         // XMM2 or XMM0 -> XMM0
//  XMM2 <- XMM0  ORPS,    // XMM0 or XMM2 -> XMM2
//  XMM0 XMM8 ORPS,           // XMM0 or XMM8 -> XMM8
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvorpscomma ( VORPS, )
//
// C prototype:
//  void dg_forthvorpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VORPS instruction. Intel docs say this opcode sequence does a bitwise
//   or of the four or eight single floating point values in target y with the
//   four or eight single floating point values in the source target and puts the 
//   results in the destination target. It's really just the same thing as a 128 
//   bit or 256 bit binary or.
//   It looks like the flags are not modified.
//   No floating point exceptions are generated.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VORPS,      // [RBX] or XMM1 -> XMM0
//  XMM2  XMM1  XMM0  VORPS,         // XMM2 or XMM1 -> XMM0
//  XMM2 <- XMM1  XMM0  VORPS,    // XMM0 or XMM1 -> XMM2
//  YMM0 YMM1  YMM8 VORPS,           // YMM0 or YMM1 -> YMM8
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpackusdwcomma ( PACKUSDW, )
//
// C prototype:
//  void dg_forthpackusdwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PACKUSDW instruction. This sequence converts four signed 32 bit
//   integers from the destination into four unsigned 16 bit integers and stores
//   them into the lower 64 bits of the destination. This sequence also converts
//   four signed 32 bit integers from the source into four unsigned 16 bit
//   integers and stores them in the upper 64 bits of the destination. If the
//   number to be converted is < 0 then it is changed to 0. If the number to be
//   converted is > 0xFFFF, it is changed to 0xFFFF.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
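//  RBX [R]  XMM0  PACKUSDW,     // XMM0[31:0]    INT32->UINT16 -> XMM0[15:0]
//                               // XMM0[63:32]   INT32->UINT16 -> XMM0[31:16]
//                               // XMM0[95:64]   INT32->UINT16 -> XMM0[47:32]
//                               // XMM0[127:96]  INT32->UINT16 -> XMM0[63:48]
//                               // [RBX][31:0]   INT32->UINT16 -> XMM0[79:64]
//                               // [RBX][63:32]  INT32->UINT16 -> XMM0[95:80]
//                               // [RBX][95:64]  INT32->UINT16 -> XMM0[111:96]
//                               // [RBX][127:96] INT32->UINT16 -> XMM0[127:112]
//
//  XMM2  XMM0  PACKUSDW,        // XMM0[31:0]    INT32->UINT16 -> XMM0[15:0]
//                               // XMM0[63:32]   INT32->UINT16 -> XMM0[31:16]
//                               // XMM0[95:64]   INT32->UINT16 -> XMM0[47:32]
//                               // XMM0[127:96]  INT32->UINT16 -> XMM0[63:48]
//                               // XMM2[31:0]    INT32->UINT16 -> XMM0[79:64]
//                               // XMM2[63:32]   INT32->UINT16 -> XMM0[95:80]
//                               // XMM2[95:64]   INT32->UINT16 -> XMM0[111:96]
//                               // XMM2[127:96]  INT32->UINT16 -> XMM0[127:112]
//
//  XMM2 <-  XMM0  PACKUSDW,  // XMM2[31:0]    INT32->UINT16 -> XMM2[15:0]
//                               // XMM2[63:32]   INT32->UINT16 -> XMM2[31:16]
//                               // XMM2[95:64]   INT32->UINT16 -> XMM2[47:32]
//                               // XMM2[127:96]  INT32->UINT16 -> XMM2[63:48]
//                               // XMM0[31:0]    INT32->UINT16 -> XMM2[79:64]
//                               // XMM0[63:32]   INT32->UINT16 -> XMM2[95:80]
//                               // XMM0[95:64]   INT32->UINT16 -> XMM2[111:96]
//                               // XMM0[127:96]  INT32->UINT16 -> XMM2[127:112]
//
//  XMM0  XMM8  PACKUSDW,        // XMM8[31:0]    INT32->UINT16 -> XMM8[15:0]
//                               // XMM8[63:32]   INT32->UINT16 -> XMM8[31:16]
//                               // XMM8[95:64]   INT32->UINT16 -> XMM8[47:32]
//                               // XMM8[127:96]  INT32->UINT16 -> XMM8[63:48]
//                               // XMM0[31:0]    INT32->UINT16 -> XMM8[79:64]
//                               // XMM0[63:32]   INT32->UINT16 -> XMM8[95:80]
//                               // XMM0[95:64]   INT32->UINT16 -> XMM8[111:96]
//                               // XMM0[127:96]  INT32->UINT16 -> XMM8[127:112]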
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpackusdwcomma ( VPACKUSDW, )
//
// C prototype:
//  void dg_forthvpackusdwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
// 
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPACKUSDW instruction. This opcode sequence gets each signed 32 bit
//   integer from the source and each signed 32 bit integer from target y,
//   converts them into unsigned 16 bit integers, and puts the
//   results into the destination. The results from target y go into the lower
//   half of each 128 bit section of the destination. If a value being
//   converted is outside the range of an unsigned 16 bit integer, the result
//   gets clipped: values < 0 become 0, and values > 0xFFFF become 0xFFFF.
//   If the source target is not a memory target, all three targets must be the
//   same type of register. The destination and target y can not be a memory
//   target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM2  XMM1  VPACKUSDW,  
//                          // XMM2[31:0]    INT32->UINT16   -> XMM1[15:0]
//                          // XMM2[63:32]   INT32->UINT16   -> XMM1[31:16]
//                          // XMM2[95:64]   INT32->UINT16   -> XMM1[47:32]
//                          // XMM2[127:96]  INT32->UINT16   -> XMM1[63:48]
//                          // [RBX][31:0]   INT32->UINT16   -> XMM1[79:64]
//                          // [RBX][63:32]  INT32->UINT16   -> XMM1[95:80]
//                          // [RBX][95:64]  INT32->UINT16   -> XMM1[111:96]
//                          // [RBX][127:96] INT32->UINT16   -> XMM1[127:112]
//
//
//  YMM2  YMM3  YMM1  VPACKUSDW,     
//                          // YMM3[31:0]    INT32->UINT16  -> YMM1[15:0]
//                          // YMM3[63:32]   INT32->UINT16  -> YMM1[31:16]
//                          // YMM3[95:64]   INT32->UINT16  -> YMM1[47:32]
//                          // YMM3[127:96]  INT32->UINT16  -> YMM1[63:48]
//                          // YMM2[31:0]    INT32->UINT16  -> YMM1[79:64]
//                          // YMM2[63:32]   INT32->UINT16  -> YMM1[95:80]
//                          // YMM2[95:64]   INT32->UINT16  -> YMM1[111:96]
//                          // YMM2[127:96]  INT32->UINT16  -> YMM1[127:112]
//                          // YMM3[159:128] INT32->UINT16  -> YMM1[143:128]
//                          // YMM3[191:160] INT32->UINT16  -> YMM1[159:144]
//                          // YMM3[223:192] INT32->UINT16  -> YMM1[175:160]
//                          // YMM3[255:224] INT32->UINT16  -> YMM1[191:176]
//                          // YMM2[159:128] INT32->UINT16  -> YMM1[207:192]
//                          // YMM2[191:160] INT32->UINT16  -> YMM1[223:208]
//                          // YMM2[223:192] INT32->UINT16  -> YMM1[239:224]
//                          // YMM2[255:224] INT32->UINT16  -> YMM1[255:240]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpaddbcomma ( PADDB, )
//
// C prototype:
//  void dg_forthpaddbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PADDB instruction. This sequence adds each byte integer in
//   the source to the corresponding byte integer in the destination and puts
//   the result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PADDB,    // XMM1[7:0]     + [RBX][7:0]     -> XMM1[7:0]
//                           // XMM1[15:8]    + [RBX][15:8]    -> XMM1[15:8]
//                           // XMM1[23:16]   + [RBX][23:16]   -> XMM1[23:16]
//                           // ...
//                           // XMM1[127:120] + [RBX][127:120] -> XMM1[127:120]
//
//  RBX [R]  ST1  PADDB,     // ST1[7:0]     + [RBX][7:0]     -> ST1[7:0]
//                           // ST1[15:8]    + [RBX][15:8]    -> ST1[15:8]
//                           // ST1[23:16]   + [RBX][23:16]   -> ST1[23:16]
//                           // ...
//                           // ST1[63:56]   + [RBX][63:56]   -> ST1[63:56]
//
//  XMM2  XMM1  PADDB,       // XMM1[7:0]     + XMM2[7:0]     -> XMM1[7:0]
//                           // XMM1[15:8]    + XMM2[15:8]    -> XMM1[15:8]
//                           // XMM1[23:16]   + XMM2[23:16]   -> XMM1[23:16]
//                           // ...
//                           // XMM1[127:120] + XMM2[127:120] -> XMM1[127:120]
//
//  ST2  ST1  PADDB,         // ST1[7:0]     + ST2[7:0]     -> ST1[7:0]
//                           // ST1[15:8]    + ST2[15:8]    -> ST1[15:8]
//                           // ST1[23:16]   + ST2[23:16]   -> ST1[23:16]
//                           // ...
//                           // ST1[63:56]   + ST2[63:56]   -> ST1[63:56]
//
//  XMM1 <-  XMM2  PADDB, // XMM1[7:0]     + XMM2[7:0]     -> XMM1[7:0]
//                           // XMM1[15:8]    + XMM2[15:8]    -> XMM1[15:8]
//                           // XMM1[23:16]   + XMM2[23:16]   -> XMM1[23:16]
//                           // ...
//                           // XMM1[127:120] + XMM2[127:120] -> XMM1[127:120]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpaddbcomma ( VPADDB, )
//
// C prototype:
//  void dg_forthvpaddbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPADDB instruction. This sequence adds each byte integer in
//   the source to the corresponding byte integer in target y and puts
//   the result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPADDB,    
//                           // XMM0[7:0]     + [RBX][7:0]     -> XMM1[7:0]
//                           // XMM0[15:8]    + [RBX][15:8]    -> XMM1[15:8]
//                           // XMM0[23:16]   + [RBX][23:16]   -> XMM1[23:16]
//                           // ...
//                           // XMM0[127:120] + [RBX][127:120] -> XMM1[127:120]
//
//  YMM2  YMM0  YMM1  VPADDB,
//                           // YMM0[7:0]     + YMM2[7:0]     -> YMM1[7:0]
//                           // YMM0[15:8]    + YMM2[15:8]    -> YMM1[15:8]
//                           // YMM0[23:16]   + YMM2[23:16]   -> YMM1[23:16]
//                           // ...
//                           // YMM0[255:248] + YMM2[255:248] -> YMM1[255:248]
//
//  XMM1 <-  XMM0  XMM2  VPADDB,
//                           // XMM0[7:0]     + XMM2[7:0]     -> XMM1[7:0]
//                           // XMM0[15:8]    + XMM2[15:8]    -> XMM1[15:8]
//                           // XMM0[23:16]   + XMM2[23:16]   -> XMM1[23:16]
//                           // ...
//                           // XMM0[127:120] + XMM2[127:120] -> XMM1[127:120]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y must 
//   be an xmm register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpadddcomma ( PADDD, )
//
// C prototype:
//  void dg_forthpadddcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PADDD instruction. This sequence adds each 32 bit integer in
//   the source to the corresponding 32 bit integer in the destination and puts
//   the result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PADDD,    // XMM1[31:0]     + [RBX][31:0]     -> XMM1[31:0]
//                           // XMM1[63:32]    + [RBX][63:32]    -> XMM1[63:32]
//                           // XMM1[95:64]    + [RBX][95:64]    -> XMM1[95:64]
//                           // XMM1[127:96]   + [RBX][127:96]   -> XMM1[127:96]
//
//  RBX [R]  ST1  PADDD,     // ST1[31:0]     + [RBX][31:0]     -> ST1[31:0]
//                           // ST1[63:32]    + [RBX][63:32]    -> ST1[63:32]
//
//  XMM2  XMM1  PADDD,       // XMM1[31:0]     + XMM2[31:0]     -> XMM1[31:0]
//                           // XMM1[63:32]    + XMM2[63:32]    -> XMM1[63:32]
//                           // XMM1[95:64]    + XMM2[95:64]    -> XMM1[95:64]
//                           // XMM1[127:96]   + XMM2[127:96]   -> XMM1[127:96]
//
//  ST2  ST1  PADDD,         // ST1[31:0]     + ST2[31:0]     -> ST1[31:0]
//                           // ST1[63:32]    + ST2[63:32]    -> ST1[63:32]
//
//  XMM1 <-  XMM2  PADDD, // XMM1[31:0]     + XMM2[31:0]     -> XMM1[31:0]
//                           // XMM1[63:32]    + XMM2[63:32]    -> XMM1[63:32]
//                           // XMM1[95:64]    + XMM2[95:64]    -> XMM1[95:64]
//                           // XMM1[127:96]   + XMM2[127:96]   -> XMM1[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpadddcomma ( VPADDD, )
//
// C prototype:
//  void dg_forthvpadddcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPADDD instruction. This sequence adds each 32 bit integer in
//   the source to the corresponding 32 bit integer in target y and puts
//   the result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPADDD,    
//                           // XMM0[31:0]     + [RBX][31:0]     -> XMM1[31:0]
//                           // XMM0[63:32]    + [RBX][63:32]    -> XMM1[63:32]
//                           // XMM0[95:64]    + [RBX][95:64]    -> XMM1[95:64]
//                           // XMM0[127:96]   + [RBX][127:96]   -> XMM1[127:96]
//
//  YMM2  YMM0  YMM1  VPADDD, // YMM0[31:0]     + YMM2[31:0]     -> YMM1[31:0]
//                           // YMM0[63:32]    + YMM2[63:32]    -> YMM1[63:32]
//                           // YMM0[95:64]    + YMM2[95:64]    -> YMM1[95:64]
//                           // YMM0[127:96]   + YMM2[127:96]   -> YMM1[127:96]
//                           // YMM0[159:128]  + YMM2[159:128]  -> YMM1[159:128]
//                           // YMM0[191:160]  + YMM2[191:160]  -> YMM1[191:160]
//                           // YMM0[223:192]  + YMM2[223:192]  -> YMM1[223:192]
//                           // YMM0[255:224]  + YMM2[255:224]  -> YMM1[255:224]
//
//  XMM1 <-  XMM0  XMM2  VPADDD, 
//                           // XMM0[31:0]     + XMM2[31:0]     -> XMM1[31:0]
//                           // XMM0[63:32]    + XMM2[63:32]    -> XMM1[63:32]
//                           // XMM0[95:64]    + XMM2[95:64]    -> XMM1[95:64]
//                           // XMM0[127:96]   + XMM2[127:96]   -> XMM1[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y must
//   be an xmm register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpaddqcomma ( PADDQ, )
//
// C prototype:
//  void dg_forthpaddqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PADDQ instruction. For floating point destinations, this sequence
//   adds the 64 bit integer in the source to the 64 bit integer in the
//   destination and puts the result into the destination.
//   For xmm destinations, this sequence adds each 64 bit integer in
//   the source to the corresponding 64 bit integer in the destination and puts
//   the result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PADDQ,    // XMM1[63:0]     + [RBX][63:0]     -> XMM1[63:0]
//                           // XMM1[127:64]   + [RBX][127:64]   -> XMM1[127:64]
//
//  RBX [R]  ST1  PADDQ,     // ST1[63:0]     + [RBX][63:0]     -> ST1[63:0]
//
//  XMM2  XMM1  PADDQ,       // XMM1[63:0]     + XMM2[63:0]     -> XMM1[63:0]
//                           // XMM1[127:64]   + XMM2[127:64]   -> XMM1[127:64]
//
//  ST2  ST1  PADDQ,         // ST1[63:0]     + ST2[63:0]     -> ST1[63:0]
//
//  XMM1 <-  XMM2  PADDQ, // XMM1[63:0]     + XMM2[63:0]     -> XMM1[63:0]
//                           // XMM1[127:64]   + XMM2[127:64]   -> XMM1[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpaddqcomma ( VPADDQ, )
//
// C prototype:
//  void dg_forthvpaddqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPADDQ instruction. 
//   This sequence adds each 64 bit integer in the source to the corresponding
//   64 bit integer in target y and puts the result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPADDQ,    
//                           // XMM0[63:0]     + [RBX][63:0]     -> XMM1[63:0]
//                           // XMM0[127:64]   + [RBX][127:64]   -> XMM1[127:64]
//
//  YMM2  YMM0  YMM1  VPADDQ, // YMM0[63:0]     + YMM2[63:0]     -> YMM1[63:0]
//                            // YMM0[127:64]   + YMM2[127:64]   -> YMM1[127:64]
//                            // YMM0[191:128]  + YMM2[191:128]  -> YMM1[191:128]
//                            // YMM0[255:192]  + YMM2[255:192]  -> YMM1[255:192]
//
//  XMM1 <-  XMM0  XMM2  VPADDQ, 
//                           // XMM0[63:0]     + XMM2[63:0]     -> XMM1[63:0]
//                           // XMM0[127:64]   + XMM2[127:64]   -> XMM1[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y must
//   be an xmm register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpaddwcomma ( PADDW, )
//
// C prototype:
//  void dg_forthpaddwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PADDW instruction. This sequence adds each 16 bit integer in
//   the source to the corresponding 16 bit integer in the destination and puts
//   the result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PADDW,    // XMM1[15:0]     + [RBX][15:0]     -> XMM1[15:0]
//                           // XMM1[31:16]    + [RBX][31:16]    -> XMM1[31:16]
//                           // XMM1[47:32]    + [RBX][47:32]    -> XMM1[47:32]
//                           // ...
//                           // XMM1[127:112]  + [RBX][127:112]  -> XMM1[127:112]
//
//  RBX [R]  ST1  PADDW,     // ST1[15:0]     + [RBX][15:0]     -> ST1[15:0]
//                           // ST1[31:16]    + [RBX][31:16]    -> ST1[31:16]
//                           // ST1[47:32]    + [RBX][47:32]    -> ST1[47:32]
//                           // ST1[63:48]    + [RBX][63:48]    -> ST1[63:48]
//
//  XMM2  XMM1  PADDW,       // XMM1[15:0]     + XMM2[15:0]     -> XMM1[15:0]
//                           // XMM1[31:16]    + XMM2[31:16]    -> XMM1[31:16]
//                           // XMM1[47:32]    + XMM2[47:32]    -> XMM1[47:32]
//                           // ...
//                           // XMM1[127:112]  + XMM2[127:112]  -> XMM1[127:112]
//
//  ST2  ST1  PADDW,         // ST1[15:0]     + ST2[15:0]     -> ST1[15:0]
//                           // ST1[31:16]    + ST2[31:16]    -> ST1[31:16]
//                           // ST1[47:32]    + ST2[47:32]    -> ST1[47:32]
//                           // ST1[63:48]    + ST2[63:48]    -> ST1[63:48]
//
//  XMM1 <-  XMM2  PADDW, // XMM1[15:0]     + XMM2[15:0]     -> XMM1[15:0]
//                           // XMM1[31:16]    + XMM2[31:16]    -> XMM1[31:16]
//                           // XMM1[47:32]    + XMM2[47:32]    -> XMM1[47:32]
//                           // ...
//                           // XMM1[127:112]  + XMM2[127:112]  -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpaddwcomma ( VPADDW, )
//
// C prototype:
//  void dg_forthvpaddwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPADDW instruction. This sequence adds each 16 bit integer in
//   the source to the corresponding 16 bit integer in target y and puts
//   the result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPADDW,   
//                           // XMM0[15:0]     + [RBX][15:0]     -> XMM1[15:0]
//                           // XMM0[31:16]    + [RBX][31:16]    -> XMM1[31:16]
//                           // XMM0[47:32]    + [RBX][47:32]    -> XMM1[47:32]
//                           // ...
//                           // XMM0[127:112]  + [RBX][127:112]  -> XMM1[127:112]
//
//  YMM2  YMM0  YMM1  VPADDW,      
//                           // YMM0[15:0]     + YMM2[15:0]     -> YMM1[15:0]
//                           // YMM0[31:16]    + YMM2[31:16]    -> YMM1[31:16]
//                           // YMM0[47:32]    + YMM2[47:32]    -> YMM1[47:32]
//                           // ...
//                           // YMM0[255:240]  + YMM2[255:240]  -> YMM1[255:240]
//
//  XMM1 <-  XMM0  XMM2  VPADDW, 
//                           // XMM0[15:0]     + XMM2[15:0]     -> XMM1[15:0]
//                           // XMM0[31:16]    + XMM2[31:16]    -> XMM1[31:16]
//                           // XMM0[47:32]    + XMM2[47:32]    -> XMM1[47:32]
//                           // ...
//                           // XMM0[127:112]  + XMM2[127:112]  -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y must 
//   be an xmm register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpaddsbcomma ( PADDSB, )
//
// C prototype:
//  void dg_forthpaddsbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PADDSB instruction. This sequence adds each signed byte integer in
//   the source to the corresponding signed byte integer in the destination
//   and puts the result into the destination. If the result is too large to
//   fit into a signed byte, the result is clipped to the maximum or minimum
//   signed byte value possible.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PADDSB,    // XMM1[7:0]     + [RBX][7:0]     -> XMM1[7:0]
//                            // XMM1[15:8]    + [RBX][15:8]    -> XMM1[15:8]
//                            // XMM1[23:16]   + [RBX][23:16]   -> XMM1[23:16]
//                            // ...
//                            // XMM1[127:120] + [RBX][127:120] -> XMM1[127:120]
//
//  RBX [R]  ST1  PADDSB,     // ST1[7:0]     + [RBX][7:0]     -> ST1[7:0]
//                            // ST1[15:8]    + [RBX][15:8]    -> ST1[15:8]
//                            // ST1[23:16]   + [RBX][23:16]   -> ST1[23:16]
//                            // ...
//                            // ST1[63:56]   + [RBX][63:56]   -> ST1[63:56]
//
//  XMM2  XMM1  PADDSB,       // XMM1[7:0]     + XMM2[7:0]     -> XMM1[7:0]
//                            // XMM1[15:8]    + XMM2[15:8]    -> XMM1[15:8]
//                            // XMM1[23:16]   + XMM2[23:16]   -> XMM1[23:16]
//                            // ...
//                            // XMM1[127:120] + XMM2[127:120] -> XMM1[127:120]
//
//  ST2  ST1  PADDSB,         // ST1[7:0]     + ST2[7:0]     -> ST1[7:0]
//                            // ST1[15:8]    + ST2[15:8]    -> ST1[15:8]
//                            // ST1[23:16]   + ST2[23:16]   -> ST1[23:16]
//                            // ...
//                            // ST1[63:56]   + ST2[63:56]   -> ST1[63:56]
//
//  XMM1 <-  XMM2  PADDSB, // XMM1[7:0]     + XMM2[7:0]     -> XMM1[7:0]
//                            // XMM1[15:8]    + XMM2[15:8]    -> XMM1[15:8]
//                            // XMM1[23:16]   + XMM2[23:16]   -> XMM1[23:16]
//                            // ...
//                            // XMM1[127:120] + XMM2[127:120] -> XMM1[127:120]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpaddsbcomma ( VPADDSB, )
//
// C prototype:
//  void dg_forthvpaddsbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPADDSB instruction. This sequence adds each signed byte integer in
//   the source to the corresponding signed byte integer in target y and puts
//   the result into the destination. If the result is too large to
//   fit into a signed byte, the result is clipped to the maximum or minimum
//   signed byte value possible.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPADDSB,    
//                           // XMM0[7:0]     + [RBX][7:0]     -> XMM1[7:0]
//                           // XMM0[15:8]    + [RBX][15:8]    -> XMM1[15:8]
//                           // XMM0[23:16]   + [RBX][23:16]   -> XMM1[23:16]
//                           // ...
//                           // XMM0[127:120] + [RBX][127:120] -> XMM1[127:120]
//
//  YMM2  YMM0  YMM1  VPADDSB,       
//                           // YMM0[7:0]     + YMM2[7:0]     -> YMM1[7:0]
//                           // YMM0[15:8]    + YMM2[15:8]    -> YMM1[15:8]
//                           // YMM0[23:16]   + YMM2[23:16]   -> YMM1[23:16]
//                           // ...
//                           // YMM0[255:248] + YMM2[255:248] -> YMM1[255:248]
//
//  XMM1 <-  XMM0  XMM2  VPADDSB, 
//                           // XMM0[7:0]     + XMM2[7:0]     -> XMM1[7:0]
//                           // XMM0[15:8]    + XMM2[15:8]    -> XMM1[15:8]
//                           // XMM0[23:16]   + XMM2[23:16]   -> XMM1[23:16]
//                           // ...
//                           // XMM0[127:120] + XMM2[127:120] -> XMM1[127:120]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y must
//   each be an xmm register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpaddswcomma ( PADDSW, )
//
// C prototype:
//  void dg_forthpaddswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PADDSW instruction. This sequence adds each signed 16 bit integer in
//   the source to the corresponding signed 16 bit integer in the destination
//   and puts the result into the destination. If the result of the signed
//   addition will not fit into a signed 16 bit integer, the result is clipped
//   to the maximum or minimum signed 16 bit integer value possible.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PADDSW,    // XMM1[15:0]     + [RBX][15:0]     -> XMM1[15:0]
//                            // XMM1[31:16]    + [RBX][31:16]    -> XMM1[31:16]
//                            // XMM1[47:32]    + [RBX][47:32]    -> XMM1[47:32]
//                            // ...
//                            // XMM1[127:112]  + [RBX][127:112]  -> XMM1[127:112]
//
//  RBX [R]  ST1  PADDSW,     // ST1[15:0]     + [RBX][15:0]     -> ST1[15:0]
//                            // ST1[31:16]    + [RBX][31:16]    -> ST1[31:16]
//                            // ST1[47:32]    + [RBX][47:32]    -> ST1[47:32]
//                            // ST1[63:48]    + [RBX][63:48]    -> ST1[63:48]
//
//  XMM2  XMM1  PADDSW,       // XMM1[15:0]     + XMM2[15:0]     -> XMM1[15:0]
//                            // XMM1[31:16]    + XMM2[31:16]    -> XMM1[31:16]
//                            // XMM1[47:32]    + XMM2[47:32]    -> XMM1[47:32]
//                            // ...
//                            // XMM1[127:112]  + XMM2[127:112]  -> XMM1[127:112]
//
//  ST2  ST1  PADDSW,         // ST1[15:0]     + ST2[15:0]     -> ST1[15:0]
//                            // ST1[31:16]    + ST2[31:16]    -> ST1[31:16]
//                            // ST1[47:32]    + ST2[47:32]    -> ST1[47:32]
//                            // ST1[63:48]    + ST2[63:48]    -> ST1[63:48]
//
//  XMM1 <-  XMM2  PADDSW, // XMM1[15:0]     + XMM2[15:0]     -> XMM1[15:0]
//                            // XMM1[31:16]    + XMM2[31:16]    -> XMM1[31:16]
//                            // XMM1[47:32]    + XMM2[47:32]    -> XMM1[47:32]
//                            // ...
//                            // XMM1[127:112]  + XMM2[127:112]  -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpaddswcomma ( VPADDSW, )
//
// C prototype:
//  void dg_forthvpaddswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPADDSW instruction. This sequence adds each 16 bit signed integer in
//   the source to the corresponding 16 bit signed integer in target y and puts
//   the result into the destination. If the result exceeds what would fit into
//   a signed 16 bit integer, the result is clipped to the maximum or minimum
//   signed 16 bit value possible.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPADDSW,   
//                           // XMM0[15:0]     + [RBX][15:0]     -> XMM1[15:0]
//                           // XMM0[31:16]    + [RBX][31:16]    -> XMM1[31:16]
//                           // XMM0[47:32]    + [RBX][47:32]    -> XMM1[47:32]
//                           // ...
//                           // XMM0[127:112]  + [RBX][127:112]  -> XMM1[127:112]
//
//  YMM2  YMM0  YMM1  VPADDSW,      
//                           // YMM0[15:0]     + YMM2[15:0]     -> YMM1[15:0]
//                           // YMM0[31:16]    + YMM2[31:16]    -> YMM1[31:16]
//                           // YMM0[47:32]    + YMM2[47:32]    -> YMM1[47:32]
//                           // ...
//                           // YMM0[255:240]  + YMM2[255:240]  -> YMM1[255:240]
//
//  XMM1 <-  XMM0  XMM2  VPADDSW, 
//                           // XMM0[15:0]     + XMM2[15:0]     -> XMM1[15:0]
//                           // XMM0[31:16]    + XMM2[31:16]    -> XMM1[31:16]
//                           // XMM0[47:32]    + XMM2[47:32]    -> XMM1[47:32]
//                           // ...
//                           // XMM0[127:112]  + XMM2[127:112]  -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y must
//   each be an xmm register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpaddusbcomma ( PADDUSB, )
//
// C prototype:
//  void dg_forthpaddusbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PADDUSB instruction. This sequence adds each unsigned byte integer
//   in the source to the corresponding unsigned byte integer in the destination
//   and puts the result into the destination. If the result is too large to
//   fit into an unsigned byte, the result is clipped to the maximum
//   unsigned byte value possible.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PADDUSB,    // XMM1[7:0]     + [RBX][7:0]     -> XMM1[7:0]
//                             // XMM1[15:8]    + [RBX][15:8]    -> XMM1[15:8]
//                             // XMM1[23:16]   + [RBX][23:16]   -> XMM1[23:16]
//                             // ...
//                             // XMM1[127:120] + [RBX][127:120] -> XMM1[127:120]
//
//  RBX [R]  ST1  PADDUSB,     // ST1[7:0]     + [RBX][7:0]     -> ST1[7:0]
//                             // ST1[15:8]    + [RBX][15:8]    -> ST1[15:8]
//                             // ST1[23:16]   + [RBX][23:16]   -> ST1[23:16]
//                             // ...
//                             // ST1[63:56]   + [RBX][63:56]   -> ST1[63:56]
//
//  XMM2  XMM1  PADDUSB,       // XMM1[7:0]     + XMM2[7:0]     -> XMM1[7:0]
//                             // XMM1[15:8]    + XMM2[15:8]    -> XMM1[15:8]
//                             // XMM1[23:16]   + XMM2[23:16]   -> XMM1[23:16]
//                             // ...
//                             // XMM1[127:120] + XMM2[127:120] -> XMM1[127:120]
//
//  ST2  ST1  PADDUSB,         // ST1[7:0]     + ST2[7:0]     -> ST1[7:0]
//                             // ST1[15:8]    + ST2[15:8]    -> ST1[15:8]
//                             // ST1[23:16]   + ST2[23:16]   -> ST1[23:16]
//                             // ...
//                             // ST1[63:56]   + ST2[63:56]   -> ST1[63:56]
//
//  XMM1 <-  XMM2  PADDUSB, // XMM1[7:0]     + XMM2[7:0]     -> XMM1[7:0]
//                             // XMM1[15:8]    + XMM2[15:8]    -> XMM1[15:8]
//                             // XMM1[23:16]   + XMM2[23:16]   -> XMM1[23:16]
//                             // ...
//                             // XMM1[127:120] + XMM2[127:120] -> XMM1[127:120]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpaddusbcomma ( VPADDUSB, )
//
// C prototype:
//  void dg_forthvpaddusbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPADDUSB instruction. This sequence adds each unsigned byte integer in
//   the source to the corresponding unsigned byte integer in target y and puts
//   the result into the destination. If the result is too large to
//   fit into an unsigned byte, the result is clipped to the maximum
//   unsigned byte value possible.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPADDUSB,    
//                           // XMM0[7:0]     + [RBX][7:0]     -> XMM1[7:0]
//                           // XMM0[15:8]    + [RBX][15:8]    -> XMM1[15:8]
//                           // XMM0[23:16]   + [RBX][23:16]   -> XMM1[23:16]
//                           // ...
//                           // XMM0[127:120] + [RBX][127:120] -> XMM1[127:120]
//
//  YMM2  YMM0  YMM1  VPADDUSB,       
//                           // YMM0[7:0]     + YMM2[7:0]     -> YMM1[7:0]
//                           // YMM0[15:8]    + YMM2[15:8]    -> YMM1[15:8]
//                           // YMM0[23:16]   + YMM2[23:16]   -> YMM1[23:16]
//                           // ...
//                           // YMM0[255:248] + YMM2[255:248] -> YMM1[255:248]
//
//  XMM1 <-  XMM0  XMM2  VPADDUSB, 
//                           // XMM0[7:0]     + XMM2[7:0]     -> XMM1[7:0]
//                           // XMM0[15:8]    + XMM2[15:8]    -> XMM1[15:8]
//                           // XMM0[23:16]   + XMM2[23:16]   -> XMM1[23:16]
//                           // ...
//                           // XMM0[127:120] + XMM2[127:120] -> XMM1[127:120]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y must 
//   be an xmm register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpadduswcomma ( PADDUSW, )
//
// C prototype:
//  void dg_forthpadduswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PADDUSW instruction. This sequence adds each unsigned 16 bit integer
//   in the source to the corresponding unsigned 16 bit integer in the destination
//   and puts the result into the destination. If the result of the unsigned
//   addition will not fit into an unsigned 16 bit integer, the result is clipped
//   to the maximum unsigned 16 bit integer value possible.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PADDUSW,    // XMM1[15:0]     + [RBX][15:0]     -> XMM1[15:0]
//                             // XMM1[31:16]    + [RBX][31:16]    -> XMM1[31:16]
//                             // XMM1[47:32]    + [RBX][47:32]    -> XMM1[47:32]
//                             // ...
//                             // XMM1[127:112]  + [RBX][127:112]  -> XMM1[127:112]
//
//  RBX [R]  ST1  PADDUSW,     // ST1[15:0]     + [RBX][15:0]     -> ST1[15:0]
//                             // ST1[31:16]    + [RBX][31:16]    -> ST1[31:16]
//                             // ST1[47:32]    + [RBX][47:32]    -> ST1[47:32]
//                             // ST1[63:48]    + [RBX][63:48]    -> ST1[63:48]
//
//  XMM2  XMM1  PADDUSW,       // XMM1[15:0]     + XMM2[15:0]     -> XMM1[15:0]
//                             // XMM1[31:16]    + XMM2[31:16]    -> XMM1[31:16]
//                             // XMM1[47:32]    + XMM2[47:32]    -> XMM1[47:32]
//                             // ...
//                             // XMM1[127:112]  + XMM2[127:112]  -> XMM1[127:112]
//
//  ST2  ST1  PADDUSW,         // ST1[15:0]     + ST2[15:0]     -> ST1[15:0]
//                             // ST1[31:16]    + ST2[31:16]    -> ST1[31:16]
//                             // ST1[47:32]    + ST2[47:32]    -> ST1[47:32]
//                             // ST1[63:48]    + ST2[63:48]    -> ST1[63:48]
//
//  XMM1 <-  XMM2  PADDUSW, // XMM1[15:0]     + XMM2[15:0]     -> XMM1[15:0]
//                             // XMM1[31:16]    + XMM2[31:16]    -> XMM1[31:16]
//                             // XMM1[47:32]    + XMM2[47:32]    -> XMM1[47:32]
//                             // ...
//                             // XMM1[127:112]  + XMM2[127:112]  -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpadduswcomma ( VPADDUSW, )
//
// C prototype:
//  void dg_forthvpadduswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPADDUSW instruction. This sequence adds each 16 bit unsigned integer 
//   in the source to the corresponding 16 bit unsigned integer in target y and 
//   puts the result into the destination. If the result exceeds what would fit 
//   into an unsigned 16 bit integer, the result is clipped to the largest 
//   unsigned 16 bit value possible.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPADDUSW,   
//                           // XMM0[15:0]     + [RBX][15:0]     -> XMM1[15:0]
//                           // XMM0[31:16]    + [RBX][31:16]    -> XMM1[31:16]
//                           // XMM0[47:32]    + [RBX][47:32]    -> XMM1[47:32]
//                           // ...
//                           // XMM0[127:112]  + [RBX][127:112]  -> XMM1[127:112]
//
//  YMM2  YMM0  YMM1  VPADDUSW,      
//                           // YMM0[15:0]     + YMM2[15:0]     -> YMM1[15:0]
//                           // YMM0[31:16]    + YMM2[31:16]    -> YMM1[31:16]
//                           // YMM0[47:32]    + YMM2[47:32]    -> YMM1[47:32]
//                           // ...
//                           // YMM0[255:240]  + YMM2[255:240]  -> YMM1[255:240]
//
//  XMM1 <-  XMM0  XMM2  VPADDUSW, 
//                           // XMM0[15:0]     + XMM2[15:0]     -> XMM1[15:0]
//                           // XMM0[31:16]    + XMM2[31:16]    -> XMM1[31:16]
//                           // XMM0[47:32]    + XMM2[47:32]    -> XMM1[47:32]
//                           // ...
//                           // XMM0[127:112]  + XMM2[127:112]  -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y must 
//   be an xmm register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpandcomma ( PAND, )
//
// C prototype:
//  void dg_forthpandcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PAND instruction. This sequence does a binary and of the two targets
//   and puts the result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PAND,    // [RBX][127:0] and XMM1[127:0] -> XMM1[127:0]
//
//  RBX [R]  ST1  PAND,     // [RBX][63:0] and ST1[63:0] -> ST1[63:0]
//
//  XMM2  XMM1  PAND,       // XMM2[127:0] and XMM1[127:0] -> XMM1[127:0]
//
//  ST2  ST1  PAND,         // ST2[63:0] and ST1[63:0] -> ST1[63:0]
//
//  XMM1 <-  XMM2  PAND, // XMM2[127:0] and XMM1[127:0] -> XMM1[127:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpandcomma ( VPAND, )
//
// C prototype:
//  void dg_forthvpandcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPAND instruction. This opcode sequence does a bitwise
//   and of the value in target y with the value in the source target and puts 
//   the result into the destination target. 
//   The flags are not modified.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VPAND,      // [RBX] and XMM1 -> XMM0
//  XMM2  XMM1  XMM0  VPAND,         // XMM2 and  XMM1 -> XMM0
//  XMM2 <- XMM1  XMM0  VPAND,    // XMM0 and  XMM1 -> XMM2
//  YMM0  YMM1  YMM8 VPAND,          // YMM0 and  YMM1 -> YMM8
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y must 
//   be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpandncomma ( PANDN, )
//
// C prototype:
//  void dg_forthpandncomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PANDN instruction. This sequence does a binary inversion of the
//   destination then binary ands the result with the source target and puts
//   the result into the destination. (A binary inversion is where you flip
//   all the bits.)
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PANDN,    // [RBX][127:0] and (not XMM1[127:0])
//                               -> XMM1[127:0]
//
//  RBX [R]  ST1  PANDN,     // [RBX][63:0] and (not ST1[63:0])
//                               -> ST1[63:0]
//
//  XMM2  XMM1  PANDN,       // XMM2[127:0] and (not XMM1[127:0])
//                               -> XMM1[127:0]
//
//  ST2  ST1  PANDN,         // ST2[63:0] and (not ST1[63:0])
//                               -> ST1[63:0]
//
//  XMM1 <-  XMM2  PANDN, // XMM2[127:0] and (not XMM1[127:0])
//                               -> XMM1[127:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
//
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpandncomma ( VPANDN, )
//
// C prototype:
//  void dg_forthvpandncomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPANDN instruction. This opcode sequence does a binary inversion of
//   the value in target y then does a bitwise and with the result of the
//   inversion with the value in the source target and puts the result into the 
//   destination target. 
//   The flags are not modified.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VPANDN,      // [RBX] and (not XMM1) -> XMM0
//  XMM2  XMM1  XMM0  VPANDN,         // XMM2 and  (not XMM1) -> XMM0
//  XMM2 <- XMM1  XMM0  VPANDN,    // XMM0 and  (not XMM1) -> XMM2
//  YMM0  YMM1  YMM8 VPANDN,          // YMM0 and  (not YMM1) -> YMM8
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y must 
//   be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpabsbcomma ( PABSB, )
//
// C prototype:
//  void dg_forthpabsbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PABSB instruction. This opcode sequence gets the absolute value of
//   each signed 8 bit integer in the memory, xmm register, or floating point
//   register source and puts the result into the xmm register or floating
//   point register destination. If the source target is not a memory target,
//   both targets must be the same type of register. The destination can not
//   be a memory target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PABSB,   // abs( [RBX][7:0] )     -> XMM1[7:0]
//                          // abs( [RBX][15:8] )    -> XMM1[15:8]
//                          // abs( [RBX][23:16] )   -> XMM1[23:16]
//                          // abs( [RBX][31:24] )   -> XMM1[31:24]
//                          // abs( [RBX][39:32] )   -> XMM1[39:32]
//                          // abs( [RBX][47:40] )   -> XMM1[47:40]
//                          // abs( [RBX][55:48] )   -> XMM1[55:48]
//                          // abs( [RBX][63:56] )   -> XMM1[63:56]
//                          // abs( [RBX][71:64] )   -> XMM1[71:64]
//                          // abs( [RBX][79:72] )   -> XMM1[79:72]
//                          // abs( [RBX][87:80] )   -> XMM1[87:80]
//                          // abs( [RBX][95:88] )   -> XMM1[95:88]
//                          // abs( [RBX][103:96] )  -> XMM1[103:96]
//                          // abs( [RBX][111:104] ) -> XMM1[111:104]
//                          // abs( [RBX][119:112] ) -> XMM1[119:112]
//                          // abs( [RBX][127:120] ) -> XMM1[127:120]
//
//  RAX [R]  ST3   PABSB,   // abs( [RAX][7:0] )     -> ST3[7:0]
//                          // abs( [RAX][15:8] )    -> ST3[15:8]
//                          // abs( [RAX][23:16] )   -> ST3[23:16]
//                          // abs( [RAX][31:24] )   -> ST3[31:24]
//                          // abs( [RAX][39:32] )   -> ST3[39:32]
//                          // abs( [RAX][47:40] )   -> ST3[47:40]
//                          // abs( [RAX][55:48] )   -> ST3[55:48]
//                          // abs( [RAX][63:56] )   -> ST3[63:56]
//
//  XMM2  XMM1  PABSB,      // abs( XMM2[7:0] )     -> XMM1[7:0]
//                          // abs( XMM2[15:8] )    -> XMM1[15:8]
//                          // abs( XMM2[23:16] )   -> XMM1[23:16]
//                          // abs( XMM2[31:24] )   -> XMM1[31:24]
//                          // abs( XMM2[39:32] )   -> XMM1[39:32]
//                          // abs( XMM2[47:40] )   -> XMM1[47:40]
//                          // abs( XMM2[55:48] )   -> XMM1[55:48]
//                          // abs( XMM2[63:56] )   -> XMM1[63:56]
//                          // abs( XMM2[71:64] )   -> XMM1[71:64]
//                          // abs( XMM2[79:72] )   -> XMM1[79:72]
//                          // abs( XMM2[87:80] )   -> XMM1[87:80]
//                          // abs( XMM2[95:88] )   -> XMM1[95:88]
//                          // abs( XMM2[103:96] )  -> XMM1[103:96]
//                          // abs( XMM2[111:104] ) -> XMM1[111:104]
//                          // abs( XMM2[119:112] ) -> XMM1[119:112]
//                          // abs( XMM2[127:120] ) -> XMM1[127:120]
//
//  ST2  ST1 PABSB,         // abs( ST2[7:0] )     -> ST1[7:0]
//                          // abs( ST2[15:8] )    -> ST1[15:8]
//                          // abs( ST2[23:16] )   -> ST1[23:16]
//                          // abs( ST2[31:24] )   -> ST1[31:24]
//                          // abs( ST2[39:32] )   -> ST1[39:32]
//                          // abs( ST2[47:40] )   -> ST1[47:40]
//                          // abs( ST2[55:48] )   -> ST1[55:48]
//                          // abs( ST2[63:56] )   -> ST1[63:56]
//
//  ST1  ST2  <-  PABSB, // abs( ST2[7:0] )     -> ST1[7:0]
//                          // abs( ST2[15:8] )    -> ST1[15:8]
//                          // abs( ST2[23:16] )   -> ST1[23:16]
//                          // abs( ST2[31:24] )   -> ST1[31:24]
//                          // abs( ST2[39:32] )   -> ST1[39:32]
//                          // abs( ST2[47:40] )   -> ST1[47:40]
//                          // abs( ST2[55:48] )   -> ST1[55:48]
//                          // abs( ST2[63:56] )   -> ST1[63:56]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpabsbcomma ( VPABSB, )
//
// C prototype:
//  void dg_forthvpabsbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   3BYTEVEX                     forces 3 byte vex encoding. If this is not
//                                 used then 2 byte vex encoding is used if
//                                 possible.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VPABSB instruction. This opcode sequence gets the absolute value of
//   each signed 8 bit integer in the memory, xmm register, or ymm
//   register source and puts the result into the xmm register or 
//   ymm register destination. The destination can not be a memory target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  VPABSB,  // abs( [RBX][7:0] )     -> XMM1[7:0]
//                          // abs( [RBX][15:8] )    -> XMM1[15:8]
//                          // abs( [RBX][23:16] )   -> XMM1[23:16]
//                          // abs( [RBX][31:24] )   -> XMM1[31:24]
//                          // abs( [RBX][39:32] )   -> XMM1[39:32]
//                          // abs( [RBX][47:40] )   -> XMM1[47:40]
//                          // abs( [RBX][55:48] )   -> XMM1[55:48]
//                          // abs( [RBX][63:56] )   -> XMM1[63:56]
//                          // abs( [RBX][71:64] )   -> XMM1[71:64]
//                          // abs( [RBX][79:72] )   -> XMM1[79:72]
//                          // abs( [RBX][87:80] )   -> XMM1[87:80]
//                          // abs( [RBX][95:88] )   -> XMM1[95:88]
//                          // abs( [RBX][103:96] )  -> XMM1[103:96]
//                          // abs( [RBX][111:104] ) -> XMM1[111:104]
//                          // abs( [RBX][119:112] ) -> XMM1[119:112]
//                          // abs( [RBX][127:120] ) -> XMM1[127:120]
//
//  YMM2  YMM1  VPABSB,     // abs( YMM2[7:0] )     -> YMM1[7:0]
//                          // abs( YMM2[15:8] )    -> YMM1[15:8]
//                          // abs( YMM2[23:16] )   -> YMM1[23:16]
//                          // abs( YMM2[31:24] )   -> YMM1[31:24]
//                          // abs( YMM2[39:32] )   -> YMM1[39:32]
//                          // abs( YMM2[47:40] )   -> YMM1[47:40]
//                          // abs( YMM2[55:48] )   -> YMM1[55:48]
//                          // abs( YMM2[63:56] )   -> YMM1[63:56]
//                          // abs( YMM2[71:64] )   -> YMM1[71:64]
//                          // abs( YMM2[79:72] )   -> YMM1[79:72]
//                          // abs( YMM2[87:80] )   -> YMM1[87:80]
//                          // abs( YMM2[95:88] )   -> YMM1[95:88]
//                          // abs( YMM2[103:96] )  -> YMM1[103:96]
//                          // abs( YMM2[111:104] ) -> YMM1[111:104]
//                          // abs( YMM2[119:112] ) -> YMM1[119:112]
//                          // abs( YMM2[127:120] ) -> YMM1[127:120]
//                          // abs( YMM2[135:128] ) -> YMM1[135:128]
//                          // abs( YMM2[143:136] ) -> YMM1[143:136]
//                          // abs( YMM2[151:144] ) -> YMM1[151:144]
//                          // abs( YMM2[159:152] ) -> YMM1[159:152]
//                          // abs( YMM2[167:160] ) -> YMM1[167:160]
//                          // abs( YMM2[175:168] ) -> YMM1[175:168]
//                          // abs( YMM2[183:176] ) -> YMM1[183:176]
//                          // abs( YMM2[191:184] ) -> YMM1[191:184]
//                          // abs( YMM2[199:192] ) -> YMM1[199:192]
//                          // abs( YMM2[207:200] ) -> YMM1[207:200]
//                          // abs( YMM2[215:208] ) -> YMM1[215:208]
//                          // abs( YMM2[223:216] ) -> YMM1[223:216]
//                          // abs( YMM2[231:224] ) -> YMM1[231:224]
//                          // abs( YMM2[239:232] ) -> YMM1[239:232]
//                          // abs( YMM2[247:240] ) -> YMM1[247:240]
//                          // abs( YMM2[255:248] ) -> YMM1[255:248]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpabsdcomma ( PABSD, )
//
// C prototype:
//  void dg_forthpabsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PABSD instruction. This opcode sequence gets the absolute value of
//   each signed 32 bit integer in the memory, xmm register, or floating point
//   register source and puts the result into the xmm register or floating
//   point register destination. If the source target is not a memory target,
//   both targets must be the same type of register. The destination can not
//   be a memory target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PABSD,   // abs( [RBX][31:0] )    -> XMM1[31:0]
//                          // abs( [RBX][63:32] )   -> XMM1[63:32]
//                          // abs( [RBX][95:64] )   -> XMM1[95:64]
//                          // abs( [RBX][127:96] )  -> XMM1[127:96]
//
//  RAX [R]  ST3   PABSD,   // abs( [RAX][31:0] )    -> ST3[31:0]
//                          // abs( [RAX][63:32] )   -> ST3[63:32]
//
//  XMM2  XMM1  PABSD,      // abs( XMM2[31:0] )    -> XMM1[31:0]
//                          // abs( XMM2[63:32] )   -> XMM1[63:32]
//                          // abs( XMM2[95:64] )   -> XMM1[95:64]
//                          // abs( XMM2[127:96] )  -> XMM1[127:96]
//
//  ST2  ST1 PABSD,         // abs( ST2[31:0] )     -> ST1[31:0]
//                          // abs( ST2[63:32] )    -> ST1[63:32]
//
//  ST1  ST2  <-  PABSD, // abs( ST2[31:0] )     -> ST1[31:0]
//                          // abs( ST2[63:32] )    -> ST1[63:32]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpabsdcomma ( VPABSD, )
//
// C prototype:
//  void dg_forthvpabsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   3BYTEVEX                     forces 3 byte vex encoding. If this is not
//                                 used then 2 byte vex encoding is used if
//                                 possible.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VPABSD instruction. This opcode sequence gets the absolute value of
//   each signed 32 bit integer in the memory, xmm register, or ymm
//   register source and puts the result into the xmm register or 
//   ymm register destination. If the source target is not a memory target,
//   both targets must be the same type of register. The destination can not
//   be a memory target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  VPABSD,  // abs( [RBX][31:0] )    -> XMM1[31:0]
//                          // abs( [RBX][63:32] )   -> XMM1[63:32]
//                          // abs( [RBX][95:64] )   -> XMM1[95:64]
//                          // abs( [RBX][127:96] )  -> XMM1[127:96]
//
//  YMM2  YMM1  VPABSD,     // abs( YMM2[31:0] )    -> YMM1[31:0]
//                          // abs( YMM2[63:32] )   -> YMM1[63:32]
//                          // abs( YMM2[95:64] )   -> YMM1[95:64]
//                          // abs( YMM2[127:96] )  -> YMM1[127:96]
//                          // abs( YMM2[159:128] ) -> YMM1[159:128]
//                          // abs( YMM2[191:160] ) -> YMM1[191:160]
//                          // abs( YMM2[223:192] ) -> YMM1[223:192]
//                          // abs( YMM2[255:224] ) -> YMM1[255:224]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpabswcomma ( PABSW, )
//
// C prototype:
//  void dg_forthpabswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PABSW instruction. This opcode sequence gets the absolute value of
//   each signed 16 bit integer in the memory, xmm register, or floating point
//   register source and puts the result into the xmm register or floating
//   point register destination. If the source target is not a memory target,
//   both targets must be the same type of register. The destination can not
//   be a memory target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PABSW,   // abs( [RBX][15:0] )    -> XMM1[15:0]
//                          // abs( [RBX][31:16] )   -> XMM1[31:16]
//                          // abs( [RBX][47:32] )   -> XMM1[47:32]
//                          // abs( [RBX][63:48] )   -> XMM1[63:48]
//                          // abs( [RBX][79:64] )   -> XMM1[79:64]
//                          // abs( [RBX][95:80] )   -> XMM1[95:80]
//                          // abs( [RBX][111:96] )  -> XMM1[111:96]
//                          // abs( [RBX][127:112] ) -> XMM1[127:112]
//
//  RAX [R]  ST3   PABSW,   // abs( [RAX][15:0] )    -> ST3[15:0]
//                          // abs( [RAX][31:16] )   -> ST3[31:16]
//                          // abs( [RAX][47:32] )   -> ST3[47:32]
//                          // abs( [RAX][63:48] )   -> ST3[63:48]
//
//  XMM2  XMM1  PABSW,      // abs( XMM2[15:0] )    -> XMM1[15:0]
//                          // abs( XMM2[31:16] )   -> XMM1[31:16]
//                          // abs( XMM2[47:32] )   -> XMM1[47:32]
//                          // abs( XMM2[63:48] )   -> XMM1[63:48]
//                          // abs( XMM2[79:64] )   -> XMM1[79:64]
//                          // abs( XMM2[95:80] )   -> XMM1[95:80]
//                          // abs( XMM2[111:96] )  -> XMM1[111:96]
//                          // abs( XMM2[127:112] ) -> XMM1[127:112]
//
//  ST2  ST1 PABSW,         // abs( ST2[15:0] )    -> ST1[15:0]
//                          // abs( ST2[31:16] )   -> ST1[31:16]
//                          // abs( ST2[47:32] )   -> ST1[47:32]
//                          // abs( ST2[63:48] )   -> ST1[63:48]
//
//  ST1  ST2  <-  PABSW, // abs( ST2[15:0] )    -> ST1[15:0]
//                          // abs( ST2[31:16] )   -> ST1[31:16]
//                          // abs( ST2[47:32] )   -> ST1[47:32]
//                          // abs( ST2[63:48] )   -> ST1[63:48]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpabswcomma ( VPABSW, )
//
// C prototype:
//  void dg_forthvpabswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   3BYTEVEX                     forces 3 byte vex encoding. If this is not
//                                 used then 2 byte vex encoding is used if
//                                 possible.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VPABSW instruction. This opcode sequence gets the absolute value of
//   each signed 16 bit integer in the memory, xmm register, or ymm
//   register source and puts the result into the xmm register or 
//   ymm register destination. If the source target is not a memory target,
//   both targets must be the same type of register. The destination can not
//   be a memory target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  VPABSW,  // abs( [RBX][15:0] )    -> XMM1[15:0]
//                          // abs( [RBX][31:16] )   -> XMM1[31:16]
//                          // abs( [RBX][47:32] )   -> XMM1[47:32]
//                          // abs( [RBX][63:48] )   -> XMM1[63:48]
//                          // abs( [RBX][79:64] )   -> XMM1[79:64]
//                          // abs( [RBX][95:80] )   -> XMM1[95:80]
//                          // abs( [RBX][111:96] )  -> XMM1[111:96]
//                          // abs( [RBX][127:112] ) -> XMM1[127:112]
//
//  YMM2  YMM1  VPABSW,     // abs( YMM2[15:0] )    -> YMM1[15:0]
//                          // abs( YMM2[31:16] )   -> YMM1[31:16]
//                          // abs( YMM2[47:32] )   -> YMM1[47:32]
//                          // abs( YMM2[63:48] )   -> YMM1[63:48]
//                          // abs( YMM2[79:64] )   -> YMM1[79:64]
//                          // abs( YMM2[95:80] )   -> YMM1[95:80]
//                          // abs( YMM2[111:96] )  -> YMM1[111:96]
//                          // abs( YMM2[127:112] ) -> YMM1[127:112]
//                          // abs( YMM2[143:128] ) -> YMM1[143:128]
//                          // abs( YMM2[159:144] ) -> YMM1[159:144]
//                          // abs( YMM2[175:160] ) -> YMM1[175:160]
//                          // abs( YMM2[191:176] ) -> YMM1[191:176]
//                          // abs( YMM2[207:192] ) -> YMM1[207:192]
//                          // abs( YMM2[223:208] ) -> YMM1[223:208]
//                          // abs( YMM2[239:224] ) -> YMM1[239:224]
//                          // abs( YMM2[255:240] ) -> YMM1[255:240]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpacksswbcomma ( PACKSSWB, )
//
// C prototype:
//  void dg_forthpacksswbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PACKSSWB instruction. This opcode sequence gets each signed 16 bit
//   integer from the source and each signed 16 bit integer from the
//   destination, converts them into signed 8 bit integers and puts the
//   results into the destination. The results from the destination go into
//   the lower half of the destination. If the result of a conversion is outside
//   the range of a signed 8 bit integer, the result gets clipped to the
//   maximum or minimum signed 8 bit value possible.
//   If the source target is not a memory target, both targets must be the same
//   type of register. The destination can not be a memory target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PACKSSWB,   // XMM1[15:0]     INT16->INT8   -> XMM1[7:0]
//                             // XMM1[31:16]    INT16->INT8   -> XMM1[15:8]
//                             // XMM1[47:32]    INT16->INT8   -> XMM1[23:16]
//                             // XMM1[63:48]    INT16->INT8   -> XMM1[31:24]
//                             // XMM1[79:64]    INT16->INT8   -> XMM1[39:32]
//                             // XMM1[95:80]    INT16->INT8   -> XMM1[47:40]
//                             // XMM1[111:96]   INT16->INT8   -> XMM1[55:48]
//                             // XMM1[127:112]  INT16->INT8   -> XMM1[63:56]
//                             // [RBX][15:0]    INT16->INT8   -> XMM1[71:64]
//                             // [RBX][31:16]   INT16->INT8   -> XMM1[79:72]
//                             // [RBX][47:32]   INT16->INT8   -> XMM1[87:80]
//                             // [RBX][63:48]   INT16->INT8   -> XMM1[95:88]
//                             // [RBX][79:64]   INT16->INT8   -> XMM1[103:96]
//                             // [RBX][95:80]   INT16->INT8   -> XMM1[111:104]
//                             // [RBX][111:96]  INT16->INT8   -> XMM1[119:112]
//                             // [RBX][127:112] INT16->INT8   -> XMM1[127:120]
//
//  RAX [R]  ST3   PACKSSWB,   // ST3[15:0]     INT16->INT8   -> ST3[7:0]
//                             // ST3[31:16]    INT16->INT8   -> ST3[15:8]
//                             // ST3[47:32]    INT16->INT8   -> ST3[23:16]
//                             // ST3[63:48]    INT16->INT8   -> ST3[31:24]
//                             // [RAX][15:0]   INT16->INT8   -> ST3[39:32]
//                             // [RAX][31:16]  INT16->INT8   -> ST3[47:40]
//                             // [RAX][47:32]  INT16->INT8   -> ST3[55:48]
//                             // [RAX][63:48]  INT16->INT8   -> ST3[63:56]
//
//  XMM2  XMM1  PACKSSWB,      // XMM1[15:0]     INT16->INT8   -> XMM1[7:0]
//                             // XMM1[31:16]    INT16->INT8   -> XMM1[15:8]
//                             // XMM1[47:32]    INT16->INT8   -> XMM1[23:16]
//                             // XMM1[63:48]    INT16->INT8   -> XMM1[31:24]
//                             // XMM1[79:64]    INT16->INT8   -> XMM1[39:32]
//                             // XMM1[95:80]    INT16->INT8   -> XMM1[47:40]
//                             // XMM1[111:96]   INT16->INT8   -> XMM1[55:48]
//                             // XMM1[127:112]  INT16->INT8   -> XMM1[63:56]
//                             // XMM2[15:0]     INT16->INT8   -> XMM1[71:64]
//                             // XMM2[31:16]    INT16->INT8   -> XMM1[79:72]
//                             // XMM2[47:32]    INT16->INT8   -> XMM1[87:80]
//                             // XMM2[63:48]    INT16->INT8   -> XMM1[95:88]
//                             // XMM2[79:64]    INT16->INT8   -> XMM1[103:96]
//                             // XMM2[95:80]    INT16->INT8   -> XMM1[111:104]
//                             // XMM2[111:96]   INT16->INT8   -> XMM1[119:112]
//                             // XMM2[127:112]  INT16->INT8   -> XMM1[127:120]
//
//  ST2  ST1 PACKSSWB,         // ST1[15:0]   INT16->INT8   -> ST1[7:0]
//                             // ST1[31:16]  INT16->INT8   -> ST1[15:8]
//                             // ST1[47:32]  INT16->INT8   -> ST1[23:16]
//                             // ST1[63:48]  INT16->INT8   -> ST1[31:24]
//                             // ST2[15:0]   INT16->INT8   -> ST1[39:32]
//                             // ST2[31:16]  INT16->INT8   -> ST1[47:40]
//                             // ST2[47:32]  INT16->INT8   -> ST1[55:48]
//                             // ST2[63:48]  INT16->INT8   -> ST1[63:56]
//
//  ST1  ST2  <-  PACKSSWB,    // ST1[15:0]   INT16->INT8   -> ST1[7:0]
//                             // ST1[31:16]  INT16->INT8   -> ST1[15:8]
//                             // ST1[47:32]  INT16->INT8   -> ST1[23:16]
//                             // ST1[63:48]  INT16->INT8   -> ST1[31:24]
//                             // ST2[15:0]   INT16->INT8   -> ST1[39:32]
//                             // ST2[31:16]  INT16->INT8   -> ST1[47:40]
//                             // ST2[47:32]  INT16->INT8   -> ST1[55:48]
//                             // ST2[63:48]  INT16->INT8   -> ST1[63:56]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpacksswbcomma ( VPACKSSWB, )
//
// C prototype:
//  void dg_forthvpacksswbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPACKSSWB instruction. This opcode sequence gets each signed 16 bit
//   integer from the source and each signed 16 bit integer from 
//   target y, converts them into signed 8 bit integers and puts the
//   results into the destination. The results from target y go into the lower
//   half of each 128 bit section of the destination. If the result of a 
//   conversion is outside the range of a signed 8 bit integer, the result gets 
//   clipped to the maximum or minimum signed 8 bit value possible.
//   If the source target is not a memory target, all three targets must be the 
//   same type of register. The destination and target y can not be a memory 
//   target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPACKSSWB,   
//                         // XMM0[15:0]     INT16->INT8   -> XMM1[7:0]
//                         // XMM0[31:16]    INT16->INT8   -> XMM1[15:8]
//                         // XMM0[47:32]    INT16->INT8   -> XMM1[23:16]
//                         // XMM0[63:48]    INT16->INT8   -> XMM1[31:24]
//                         // XMM0[79:64]    INT16->INT8   -> XMM1[39:32]
//                         // XMM0[95:80]    INT16->INT8   -> XMM1[47:40]
//                         // XMM0[111:96]   INT16->INT8   -> XMM1[55:48]
//                         // XMM0[127:112]  INT16->INT8   -> XMM1[63:56]
//                         // [RBX][15:0]    INT16->INT8   -> XMM1[71:64]
//                         // [RBX][31:16]   INT16->INT8   -> XMM1[79:72]
//                         // [RBX][47:32]   INT16->INT8   -> XMM1[87:80]
//                         // [RBX][63:48]   INT16->INT8   -> XMM1[95:88]
//                         // [RBX][79:64]   INT16->INT8   -> XMM1[103:96]
//                         // [RBX][95:80]   INT16->INT8   -> XMM1[111:104]
//                         // [RBX][111:96]  INT16->INT8   -> XMM1[119:112]
//                         // [RBX][127:112] INT16->INT8   -> XMM1[127:120]
//
//  YMM2  YMM0  YMM1  VPACKSSWB,     
//                         // YMM0[15:0]     INT16->INT8   -> YMM1[7:0]
//                         // YMM0[31:16]    INT16->INT8   -> YMM1[15:8]
//                         // YMM0[47:32]    INT16->INT8   -> YMM1[23:16]
//                         // YMM0[63:48]    INT16->INT8   -> YMM1[31:24]
//                         // YMM0[79:64]    INT16->INT8   -> YMM1[39:32]
//                         // YMM0[95:80]    INT16->INT8   -> YMM1[47:40]
//                         // YMM0[111:96]   INT16->INT8   -> YMM1[55:48]
//                         // YMM0[127:112]  INT16->INT8   -> YMM1[63:56]
//                         // YMM2[15:0]     INT16->INT8   -> YMM1[71:64]
//                         // YMM2[31:16]    INT16->INT8   -> YMM1[79:72]
//                         // YMM2[47:32]    INT16->INT8   -> YMM1[87:80]
//                         // YMM2[63:48]    INT16->INT8   -> YMM1[95:88]
//                         // YMM2[79:64]    INT16->INT8   -> YMM1[103:96]
//                         // YMM2[95:80]    INT16->INT8   -> YMM1[111:104]
//                         // YMM2[111:96]   INT16->INT8   -> YMM1[119:112]
//                         // YMM2[127:112]  INT16->INT8   -> YMM1[127:120]
//                         // YMM0[143:128]  INT16->INT8   -> YMM1[135:128]
//                         // YMM0[159:144]  INT16->INT8   -> YMM1[143:136]
//                         // YMM0[175:160]  INT16->INT8   -> YMM1[151:144]
//                         // YMM0[191:176]  INT16->INT8   -> YMM1[159:152]
//                         // YMM0[207:192]  INT16->INT8   -> YMM1[167:160]
//                         // YMM0[223:208]  INT16->INT8   -> YMM1[175:168]
//                         // YMM0[239:224]  INT16->INT8   -> YMM1[183:176]
//                         // YMM0[255:240]  INT16->INT8   -> YMM1[191:184]
//                         // YMM2[143:128]  INT16->INT8   -> YMM1[199:192]
//                         // YMM2[159:144]  INT16->INT8   -> YMM1[207:200]
//                         // YMM2[175:160]  INT16->INT8   -> YMM1[215:208]
//                         // YMM2[191:176]  INT16->INT8   -> YMM1[223:216]
//                         // YMM2[207:192]  INT16->INT8   -> YMM1[231:224]
//                         // YMM2[223:208]  INT16->INT8   -> YMM1[239:232]
//                         // YMM2[239:224]  INT16->INT8   -> YMM1[247:240]
//                         // YMM2[255:240]  INT16->INT8   -> YMM1[255:248]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpackssdwcomma ( PACKSSDW, )
//
// C prototype:
//  void dg_forthpackssdwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PACKSSDW instruction. This opcode sequence gets each signed 32 bit
//   integer from the source and each signed 32 bit integer from the
//   destination, converts them into signed 16 bit integers and puts the
//   results into the destination. The results from the destination go into
//   the lower half of the destination. If the result of a conversion is outside
//   the range of a signed 16 bit integer, the result gets clipped to the
//   maximum or minimum signed 16 bit value possible.
//   If the source target is not a memory target, both targets must be the same
//   type of register. The destination can not be a memory target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PACKSSDW,   // XMM1[31:0]    INT32->INT16  -> XMM1[15:0]
//                             // XMM1[63:32]   INT32->INT16  -> XMM1[31:16]
//                             // XMM1[95:64]   INT32->INT16  -> XMM1[47:32]
//                             // XMM1[127:96]  INT32->INT16  -> XMM1[63:48]
//                             // [RBX][31:0]   INT32->INT16  -> XMM1[79:64]
//                             // [RBX][63:32]  INT32->INT16  -> XMM1[95:80]
//                             // [RBX][95:64]  INT32->INT16  -> XMM1[111:96]
//                             // [RBX][127:96] INT32->INT16  -> XMM1[127:112]
//
//  RAX [R]  ST3   PACKSSDW,   // ST3[31:0]    INT32->INT16  -> ST3[15:0]
//                             // ST3[63:32]   INT32->INT16  -> ST3[31:16]
//                             // [RAX][31:0]  INT32->INT16  -> ST3[47:32]
//                             // [RAX][63:32] INT32->INT16  -> ST3[63:48]
//
//  XMM2  XMM1  PACKSSDW,      // XMM1[31:0]   INT32->INT16 -> XMM1[15:0]
//                             // XMM1[63:32]  INT32->INT16 -> XMM1[31:16]
//                             // XMM1[95:64]  INT32->INT16 -> XMM1[47:32]
//                             // XMM1[127:96] INT32->INT16 -> XMM1[63:48]
//                             // XMM2[31:0]   INT32->INT16 -> XMM1[79:64]
//                             // XMM2[63:32]  INT32->INT16 -> XMM1[95:80]
//                             // XMM2[95:64]  INT32->INT16 -> XMM1[111:96]
//                             // XMM2[127:96] INT32->INT16 -> XMM1[127:112]
//
//  ST2  ST1 PACKSSDW,         // ST1[31:0]  INT32->INT16  -> ST1[15:0]
//                             // ST1[63:32] INT32->INT16  -> ST1[31:16]
//                             // ST2[31:0]  INT32->INT16  -> ST1[47:32]
//                             // ST2[63:32] INT32->INT16  -> ST1[63:48]
//
//  ST1  ST2  <-  PACKSSDW,    // ST1[31:0]   INT32->INT16 -> ST1[15:0]
//                             // ST1[63:32]  INT32->INT16 -> ST1[31:16]
//                             // ST2[31:0]   INT32->INT16 -> ST1[47:32]
//                             // ST2[63:32]  INT32->INT16 -> ST1[63:48]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpackssdwcomma ( VPACKSSDW, )
//
// C prototype:
//  void dg_forthvpackssdwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
// 
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPACKSSDW instruction. This opcode sequence gets each signed 32 bit
//   integer from the source and each signed 32 bit integer from target y,
//   converts them into signed 16 bit integers and puts the
//   results into the destination. The results from target y go into the lower 
//   half of each 128 bit section of the destination. If the result of a 
//   conversion is outside the range of a signed 16 bit integer, the result gets 
//   clipped to the maximum or minimum signed 16 bit value possible.
//   If the source target is not a memory target, all three targets must be the 
//   same type of register. The destination and target y can not be a memory 
//   target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM2  XMM1  VPACKSSDW,  
//                          // XMM2[31:0]    INT32->INT16  -> XMM1[15:0]
//                          // XMM2[63:32]   INT32->INT16  -> XMM1[31:16]
//                          // XMM2[95:64]   INT32->INT16  -> XMM1[47:32]
//                          // XMM2[127:96]  INT32->INT16  -> XMM1[63:48]
//                          // [RBX][31:0]   INT32->INT16  -> XMM1[79:64]
//                          // [RBX][63:32]  INT32->INT16  -> XMM1[95:80]
//                          // [RBX][95:64]  INT32->INT16  -> XMM1[111:96]
//                          // [RBX][127:96] INT32->INT16  -> XMM1[127:112]
//
//
//  YMM2  YMM3  YMM1  VPACKSSDW,     
//                          // YMM3[31:0]    INT32->INT16 -> YMM1[15:0]
//                          // YMM3[63:32]   INT32->INT16 -> YMM1[31:16]
//                          // YMM3[95:64]   INT32->INT16 -> YMM1[47:32]
//                          // YMM3[127:96]  INT32->INT16 -> YMM1[63:48]
//                          // YMM2[31:0]    INT32->INT16 -> YMM1[79:64]
//                          // YMM2[63:32]   INT32->INT16 -> YMM1[95:80]
//                          // YMM2[95:64]   INT32->INT16 -> YMM1[111:96]
//                          // YMM2[127:96]  INT32->INT16 -> YMM1[127:112]
//                          // YMM3[159:128] INT32->INT16 -> YMM1[143:128]
//                          // YMM3[191:160] INT32->INT16 -> YMM1[159:144]
//                          // YMM3[223:192] INT32->INT16 -> YMM1[175:160]
//                          // YMM3[255:224] INT32->INT16 -> YMM1[191:176]
//                          // YMM2[159:128] INT32->INT16 -> YMM1[207:192]
//                          // YMM2[191:160] INT32->INT16 -> YMM1[223:208]
//                          // YMM2[223:192] INT32->INT16 -> YMM1[239:224]
//                          // YMM2[255:224] INT32->INT16 -> YMM1[255:240]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpackuswbcomma ( PACKUSWB, )
//
// C prototype:
//  void dg_forthpackuswbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PACKUSWB instruction. This opcode sequence gets each unsigned 16 bit
//   integer from the source and each unsigned 16 bit integer from the
//   destination, converts them into unsigned 8 bit integers and puts the
//   results into the destination. The results from the destination go into
//   the lower half of the destination. If the result of a conversion is outside
//   the range of an unsigned 8 bit integer, the result gets clipped to the
//   maximum unsigned 8 bit value possible.
//   If the source target is not a memory target, both targets must be the same
//   type of register. The destination can not be a memory target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PACKUSWB, // XMM1[15:0]     UINT16->UINT8 -> XMM1[7:0]
//                           // XMM1[31:16]    UINT16->UINT8 -> XMM1[15:8]
//                           // XMM1[47:32]    UINT16->UINT8 -> XMM1[23:16]
//                           // XMM1[63:48]    UINT16->UINT8 -> XMM1[31:24]
//                           // XMM1[79:64]    UINT16->UINT8 -> XMM1[39:32]
//                           // XMM1[95:80]    UINT16->UINT8 -> XMM1[47:40]
//                           // XMM1[111:96]   UINT16->UINT8 -> XMM1[55:48]
//                           // XMM1[127:112]  UINT16->UINT8 -> XMM1[63:56]
//                           // [RBX][15:0]    UINT16->UINT8 -> XMM1[71:64]
//                           // [RBX][31:16]   UINT16->UINT8 -> XMM1[79:72]
//                           // [RBX][47:32]   UINT16->UINT8 -> XMM1[87:80]
//                           // [RBX][63:48]   UINT16->UINT8 -> XMM1[95:88]
//                           // [RBX][79:64]   UINT16->UINT8 -> XMM1[103:96]
//                           // [RBX][95:80]   UINT16->UINT8 -> XMM1[111:104]
//                           // [RBX][111:96]  UINT16->UINT8 -> XMM1[119:112]
//                           // [RBX][127:112] UINT16->UINT8 -> XMM1[127:120]
//
//  RAX [R]  ST3   PACKUSWB,   // ST3[15:0]     UINT16->UINT8   -> ST3[7:0]
//                             // ST3[31:16]    UINT16->UINT8   -> ST3[15:8]
//                             // ST3[47:32]    UINT16->UINT8   -> ST3[23:16]
//                             // ST3[63:48]    UINT16->UINT8   -> ST3[31:24]
//                             // [RAX][15:0]   UINT16->UINT8   -> ST3[39:32]
//                             // [RAX][31:16]  UINT16->UINT8   -> ST3[47:40]
//                             // [RAX][47:32]  UINT16->UINT8   -> ST3[55:48]
//                             // [RAX][63:48]  UINT16->UINT8   -> ST3[63:56]
//
//  XMM2  XMM1  PACKUSWB,  // XMM1[15:0]     UINT16->UINT8   -> XMM1[7:0]
//                         // XMM1[31:16]    UINT16->UINT8   -> XMM1[15:8]
//                         // XMM1[47:32]    UINT16->UINT8   -> XMM1[23:16]
//                         // XMM1[63:48]    UINT16->UINT8   -> XMM1[31:24]
//                         // XMM1[79:64]    UINT16->UINT8   -> XMM1[39:32]
//                         // XMM1[95:80]    UINT16->UINT8   -> XMM1[47:40]
//                         // XMM1[111:96]   UINT16->UINT8   -> XMM1[55:48]
//                         // XMM1[127:112]  UINT16->UINT8   -> XMM1[63:56]
//                         // XMM2[15:0]     UINT16->UINT8   -> XMM1[71:64]
//                         // XMM2[31:16]    UINT16->UINT8   -> XMM1[79:72]
//                         // XMM2[47:32]    UINT16->UINT8   -> XMM1[87:80]
//                         // XMM2[63:48]    UINT16->UINT8   -> XMM1[95:88]
//                         // XMM2[79:64]    UINT16->UINT8   -> XMM1[103:96]
//                         // XMM2[95:80]    UINT16->UINT8   -> XMM1[111:104]
//                         // XMM2[111:96]   UINT16->UINT8   -> XMM1[119:112]
//                         // XMM2[127:112]  UINT16->UINT8   -> XMM1[127:120]
//
//  ST2  ST1 PACKUSWB,         // ST1[15:0]   UINT16->UINT8   -> ST1[7:0]
//                             // ST1[31:16]  UINT16->UINT8   -> ST1[15:8]
//                             // ST1[47:32]  UINT16->UINT8   -> ST1[23:16]
//                             // ST1[63:48]  UINT16->UINT8   -> ST1[31:24]
//                             // ST2[15:0]   UINT16->UINT8   -> ST1[39:32]
//                             // ST2[31:16]  UINT16->UINT8   -> ST1[47:40]
//                             // ST2[47:32]  UINT16->UINT8   -> ST1[55:48]
//                             // ST2[63:48]  UINT16->UINT8   -> ST1[63:56]
//
//  ST1  ST2  <-  PACKUSWB, // ST1[15:0]   UINT16->UINT8   -> ST1[7:0]
//                             // ST1[31:16]  UINT16->UINT8   -> ST1[15:8]
//                             // ST1[47:32]  UINT16->UINT8   -> ST1[23:16]
//                             // ST1[63:48]  UINT16->UINT8   -> ST1[31:24]
//                             // ST2[15:0]   UINT16->UINT8   -> ST1[39:32]
//                             // ST2[31:16]  UINT16->UINT8   -> ST1[47:40]
//                             // ST2[47:32]  UINT16->UINT8   -> ST1[55:48]
//                             // ST2[63:48]  UINT16->UINT8   -> ST1[63:56]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpackuswbcomma ( VPACKUSWB, )
//
// C prototype:
//  void dg_forthvpackuswbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPACKUSWB instruction. This opcode sequence gets each unsigned 16 bit
//   integer from the source and each unsigned 16 bit integer from
//   target y, converts them into unsigned 8 bit integers and puts the
//   results into the destination. The results from target y go into the lower
//   half of each 128 bit section of the destination. If the result of a
//   conversion is outside the range of an unsigned 8 bit integer, the result gets
//   clipped to the maximum unsigned 8 bit value possible.
//   If the source target is not a memory target, all three targets must be the 
//   same type of register. The destination and target y can not be a memory 
//   target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPACKUSWB,   
//                         // XMM0[15:0]     UINT16->UINT8   -> XMM1[7:0]
//                         // XMM0[31:16]    UINT16->UINT8   -> XMM1[15:8]
//                         // XMM0[47:32]    UINT16->UINT8   -> XMM1[23:16]
//                         // XMM0[63:48]    UINT16->UINT8   -> XMM1[31:24]
//                         // XMM0[79:64]    UINT16->UINT8   -> XMM1[39:32]
//                         // XMM0[95:80]    UINT16->UINT8   -> XMM1[47:40]
//                         // XMM0[111:96]   UINT16->UINT8   -> XMM1[55:48]
//                         // XMM0[127:112]  UINT16->UINT8   -> XMM1[63:56]
//                         // [RBX][15:0]    UINT16->UINT8   -> XMM1[71:64]
//                         // [RBX][31:16]   UINT16->UINT8   -> XMM1[79:72]
//                         // [RBX][47:32]   UINT16->UINT8   -> XMM1[87:80]
//                         // [RBX][63:48]   UINT16->UINT8   -> XMM1[95:88]
//                         // [RBX][79:64]   UINT16->UINT8   -> XMM1[103:96]
//                         // [RBX][95:80]   UINT16->UINT8   -> XMM1[111:104]
//                         // [RBX][111:96]  UINT16->UINT8   -> XMM1[119:112]
//                         // [RBX][127:112] UINT16->UINT8   -> XMM1[127:120]
//
//  YMM2  YMM0  YMM1  VPACKUSWB,     
//                         // YMM0[15:0]     UINT16->UINT8   -> YMM1[7:0]
//                         // YMM0[31:16]    UINT16->UINT8   -> YMM1[15:8]
//                         // YMM0[47:32]    UINT16->UINT8   -> YMM1[23:16]
//                         // YMM0[63:48]    UINT16->UINT8   -> YMM1[31:24]
//                         // YMM0[79:64]    UINT16->UINT8   -> YMM1[39:32]
//                         // YMM0[95:80]    UINT16->UINT8   -> YMM1[47:40]
//                         // YMM0[111:96]   UINT16->UINT8   -> YMM1[55:48]
//                         // YMM0[127:112]  UINT16->UINT8   -> YMM1[63:56]
//                         // YMM2[15:0]     UINT16->UINT8   -> YMM1[71:64]
//                         // YMM2[31:16]    UINT16->UINT8   -> YMM1[79:72]
//                         // YMM2[47:32]    UINT16->UINT8   -> YMM1[87:80]
//                         // YMM2[63:48]    UINT16->UINT8   -> YMM1[95:88]
//                         // YMM2[79:64]    UINT16->UINT8   -> YMM1[103:96]
//                         // YMM2[95:80]    UINT16->UINT8   -> YMM1[111:104]
//                         // YMM2[111:96]   UINT16->UINT8   -> YMM1[119:112]
//                         // YMM2[127:112]  UINT16->UINT8   -> YMM1[127:120]
//                         // YMM0[143:128]  UINT16->UINT8   -> YMM1[135:128]
//                         // YMM0[159:144]  UINT16->UINT8   -> YMM1[143:136]
//                         // YMM0[175:160]  UINT16->UINT8   -> YMM1[151:144]
//                         // YMM0[191:176]  UINT16->UINT8   -> YMM1[159:152]
//                         // YMM0[207:192]  UINT16->UINT8   -> YMM1[167:160]
//                         // YMM0[223:208]  UINT16->UINT8   -> YMM1[175:168]
//                         // YMM0[239:224]  UINT16->UINT8   -> YMM1[183:176]
//                         // YMM0[255:240]  UINT16->UINT8   -> YMM1[191:184]
//                         // YMM2[143:128]  UINT16->UINT8   -> YMM1[199:192]
//                         // YMM2[159:144]  UINT16->UINT8   -> YMM1[207:200]
//                         // YMM2[175:160]  UINT16->UINT8   -> YMM1[215:208]
//                         // YMM2[191:176]  UINT16->UINT8   -> YMM1[223:216]
//                         // YMM2[207:192]  UINT16->UINT8   -> YMM1[231:224]
//                         // YMM2[223:208]  UINT16->UINT8   -> YMM1[239:232]
//                         // YMM2[239:224]  UINT16->UINT8   -> YMM1[247:240]
//                         // YMM2[255:240]  UINT16->UINT8   -> YMM1[255:248]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpavgbcomma ( PAVGB, )
//
// C prototype:
//  void dg_forthpavgbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PAVGB instruction. This opcode sequence gets the average of each
//   unsigned byte in the memory, xmm register, or floating point register
//   source and the corresponding unsigned byte in the xmm register or
//   floating point register destination and puts the results into the
//   destination. If the source target is not a
//   memory target, both targets must be the same type of register.
//   The destination can not be a memory target.
//   For each pair of bytes the calculation is: (source[i] + dest[i] + 1)/2
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PAVGB,
//   // ( [RBX][7:0]     + XMM1[7:0]     + 1 )/2  -> XMM1[7:0]
//   // ( [RBX][15:8]    + XMM1[15:8]    + 1 )/2  -> XMM1[15:8]
//   // ( [RBX][23:16]   + XMM1[23:16]   + 1 )/2  -> XMM1[23:16]
//   // ( [RBX][31:24]   + XMM1[31:24]   + 1 )/2  -> XMM1[31:24]
//   // ( [RBX][39:32]   + XMM1[39:32]   + 1 )/2  -> XMM1[39:32]
//   // ( [RBX][47:40]   + XMM1[47:40]   + 1 )/2  -> XMM1[47:40]
//   // ( [RBX][55:48]   + XMM1[55:48]   + 1 )/2  -> XMM1[55:48]
//   // ( [RBX][63:56]   + XMM1[63:56]   + 1 )/2  -> XMM1[63:56]
//   // ( [RBX][71:64]   + XMM1[71:64]   + 1 )/2  -> XMM1[71:64]
//   // ( [RBX][79:72]   + XMM1[79:72]   + 1 )/2  -> XMM1[79:72]
//   // ( [RBX][87:80]   + XMM1[87:80]   + 1 )/2  -> XMM1[87:80]
//   // ( [RBX][95:88]   + XMM1[95:88]   + 1 )/2  -> XMM1[95:88]
//   // ( [RBX][103:96]  + XMM1[103:96]  + 1 )/2  -> XMM1[103:96]
//   // ( [RBX][111:104] + XMM1[111:104] + 1 )/2  -> XMM1[111:104]
//   // ( [RBX][119:112] + XMM1[119:112] + 1 )/2  -> XMM1[119:112]
//   // ( [RBX][127:120] + XMM1[127:120] + 1 )/2  -> XMM1[127:120]
//
//  RAX [R]  ST3   PAVGB,
//   // ( [RAX][7:0]     + ST3[7:0]     + 1 )/2  -> ST3[7:0]
//   // ( [RAX][15:8]    + ST3[15:8]    + 1 )/2  -> ST3[15:8]
//   // ( [RAX][23:16]   + ST3[23:16]   + 1 )/2  -> ST3[23:16]
//   // ( [RAX][31:24]   + ST3[31:24]   + 1 )/2  -> ST3[31:24]
//   // ( [RAX][39:32]   + ST3[39:32]   + 1 )/2  -> ST3[39:32]
//   // ( [RAX][47:40]   + ST3[47:40]   + 1 )/2  -> ST3[47:40]
//   // ( [RAX][55:48]   + ST3[55:48]   + 1 )/2  -> ST3[55:48]
//   // ( [RAX][63:56]   + ST3[63:56]   + 1 )/2  -> ST3[63:56]
//
//  XMM2  XMM1  PAVGB,
//   // ( XMM2[7:0]     + XMM1[7:0]     + 1 )/2  -> XMM1[7:0]
//   // ( XMM2[15:8]    + XMM1[15:8]    + 1 )/2  -> XMM1[15:8]
//   // ( XMM2[23:16]   + XMM1[23:16]   + 1 )/2  -> XMM1[23:16]
//   // ( XMM2[31:24]   + XMM1[31:24]   + 1 )/2  -> XMM1[31:24]
//   // ( XMM2[39:32]   + XMM1[39:32]   + 1 )/2  -> XMM1[39:32]
//   // ( XMM2[47:40]   + XMM1[47:40]   + 1 )/2  -> XMM1[47:40]
//   // ( XMM2[55:48]   + XMM1[55:48]   + 1 )/2  -> XMM1[55:48]
//   // ( XMM2[63:56]   + XMM1[63:56]   + 1 )/2  -> XMM1[63:56]
//   // ( XMM2[71:64]   + XMM1[71:64]   + 1 )/2  -> XMM1[71:64]
//   // ( XMM2[79:72]   + XMM1[79:72]   + 1 )/2  -> XMM1[79:72]
//   // ( XMM2[87:80]   + XMM1[87:80]   + 1 )/2  -> XMM1[87:80]
//   // ( XMM2[95:88]   + XMM1[95:88]   + 1 )/2  -> XMM1[95:88]
//   // ( XMM2[103:96]  + XMM1[103:96]  + 1 )/2  -> XMM1[103:96]
//   // ( XMM2[111:104] + XMM1[111:104] + 1 )/2  -> XMM1[111:104]
//   // ( XMM2[119:112] + XMM1[119:112] + 1 )/2  -> XMM1[119:112]
//   // ( XMM2[127:120] + XMM1[127:120] + 1 )/2  -> XMM1[127:120]
//
//  ST2  ST1 PAVGB,
//   // ( ST2[7:0]     + ST1[7:0]     + 1 )/2  -> ST1[7:0]
//   // ( ST2[15:8]    + ST1[15:8]    + 1 )/2  -> ST1[15:8]
//   // ( ST2[23:16]   + ST1[23:16]   + 1 )/2  -> ST1[23:16]
//   // ( ST2[31:24]   + ST1[31:24]   + 1 )/2  -> ST1[31:24]
//   // ( ST2[39:32]   + ST1[39:32]   + 1 )/2  -> ST1[39:32]
//   // ( ST2[47:40]   + ST1[47:40]   + 1 )/2  -> ST1[47:40]
//   // ( ST2[55:48]   + ST1[55:48]   + 1 )/2  -> ST1[55:48]
//   // ( ST2[63:56]   + ST1[63:56]   + 1 )/2  -> ST1[63:56]
//
//  ST1  ST2  <-  PAVGB,
//   // ( ST2[7:0]     + ST1[7:0]     + 1 )/2  -> ST1[7:0]
//   // ( ST2[15:8]    + ST1[15:8]    + 1 )/2  -> ST1[15:8]
//   // ( ST2[23:16]   + ST1[23:16]   + 1 )/2  -> ST1[23:16]
//   // ( ST2[31:24]   + ST1[31:24]   + 1 )/2  -> ST1[31:24]
//   // ( ST2[39:32]   + ST1[39:32]   + 1 )/2  -> ST1[39:32]
//   // ( ST2[47:40]   + ST1[47:40]   + 1 )/2  -> ST1[47:40]
//   // ( ST2[55:48]   + ST1[55:48]   + 1 )/2  -> ST1[55:48]
//   // ( ST2[63:56]   + ST1[63:56]   + 1 )/2  -> ST1[63:56]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpavgbcomma ( VPAVGB, )
//
// C prototype:
//  void dg_forthvpavgbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
// 
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPAVGB instruction. This opcode sequence gets the average of each
//   unsigned byte in the memory, xmm register, or ymm register
//   source and the corresponding unsigned byte in the xmm register or
//   ymm register in target y and puts the results into the
//   destination. If the source target is not a
//   memory target, all three targets must be the same type of register.
//   The destination can not be a memory target.
//   For each pair of bytes the calculation is: 
//    (source[i] + targety[i] + 1)/2 -> dest[i]
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPAVGB,
//   // ( [RBX][7:0]     + XMM0[7:0]     + 1 )/2  -> XMM1[7:0]
//   // ( [RBX][15:8]    + XMM0[15:8]    + 1 )/2  -> XMM1[15:8]
//   // ( [RBX][23:16]   + XMM0[23:16]   + 1 )/2  -> XMM1[23:16]
//   // ( [RBX][31:24]   + XMM0[31:24]   + 1 )/2  -> XMM1[31:24]
//   // ( [RBX][39:32]   + XMM0[39:32]   + 1 )/2  -> XMM1[39:32]
//   // ( [RBX][47:40]   + XMM0[47:40]   + 1 )/2  -> XMM1[47:40]
//   // ( [RBX][55:48]   + XMM0[55:48]   + 1 )/2  -> XMM1[55:48]
//   // ( [RBX][63:56]   + XMM0[63:56]   + 1 )/2  -> XMM1[63:56]
//   // ( [RBX][71:64]   + XMM0[71:64]   + 1 )/2  -> XMM1[71:64]
//   // ( [RBX][79:72]   + XMM0[79:72]   + 1 )/2  -> XMM1[79:72]
//   // ( [RBX][87:80]   + XMM0[87:80]   + 1 )/2  -> XMM1[87:80]
//   // ( [RBX][95:88]   + XMM0[95:88]   + 1 )/2  -> XMM1[95:88]
//   // ( [RBX][103:96]  + XMM0[103:96]  + 1 )/2  -> XMM1[103:96]
//   // ( [RBX][111:104] + XMM0[111:104] + 1 )/2  -> XMM1[111:104]
//   // ( [RBX][119:112] + XMM0[119:112] + 1 )/2  -> XMM1[119:112]
//   // ( [RBX][127:120] + XMM0[127:120] + 1 )/2  -> XMM1[127:120]
//
//  YMM2  YMM0  YMM1  VPAVGB,
//   // ( YMM2[7:0]     + YMM0[7:0]     + 1 )/2  -> YMM1[7:0]
//   // ( YMM2[15:8]    + YMM0[15:8]    + 1 )/2  -> YMM1[15:8]
//   // ( YMM2[23:16]   + YMM0[23:16]   + 1 )/2  -> YMM1[23:16]
//   // ( YMM2[31:24]   + YMM0[31:24]   + 1 )/2  -> YMM1[31:24]
//   // ( YMM2[39:32]   + YMM0[39:32]   + 1 )/2  -> YMM1[39:32]
//   // ( YMM2[47:40]   + YMM0[47:40]   + 1 )/2  -> YMM1[47:40]
//   // ( YMM2[55:48]   + YMM0[55:48]   + 1 )/2  -> YMM1[55:48]
//   // ( YMM2[63:56]   + YMM0[63:56]   + 1 )/2  -> YMM1[63:56]
//   // ( YMM2[71:64]   + YMM0[71:64]   + 1 )/2  -> YMM1[71:64]
//   // ( YMM2[79:72]   + YMM0[79:72]   + 1 )/2  -> YMM1[79:72]
//   // ( YMM2[87:80]   + YMM0[87:80]   + 1 )/2  -> YMM1[87:80]
//   // ( YMM2[95:88]   + YMM0[95:88]   + 1 )/2  -> YMM1[95:88]
//   // ( YMM2[103:96]  + YMM0[103:96]  + 1 )/2  -> YMM1[103:96]
//   // ( YMM2[111:104] + YMM0[111:104] + 1 )/2  -> YMM1[111:104]
//   // ( YMM2[119:112] + YMM0[119:112] + 1 )/2  -> YMM1[119:112]
//   // ( YMM2[127:120] + YMM0[127:120] + 1 )/2  -> YMM1[127:120]
//   // ...
//   // ( YMM2[255:248] + YMM0[255:248] + 1 )/2  -> YMM1[255:248]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpavgwcomma ( PAVGW, )
//
// C prototype:
//  void dg_forthpavgwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PAVGW instruction. This opcode sequence gets the average of each
//   unsigned 16 bit integer in the memory, xmm register, or floating point
//   register source and the corresponding unsigned 16 bit integer in the xmm
//   register or floating point register destination and puts the results
//   into the destination. If the source target is not a memory target,
//   both targets must be the same type of register.
//   The destination can not be a memory target.
//   For each pair of 16 bit integers the calculation is:
//    (source[i] + dest[i] + 1)/2
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PAVGW,
//   // ( [RBX][15:0]    + XMM1[15:0]    + 1)/2 -> XMM1[15:0]
//   // ( [RBX][31:16]   + XMM1[31:16]   + 1)/2 -> XMM1[31:16]
//   // ( [RBX][47:32]   + XMM1[47:32]   + 1)/2 -> XMM1[47:32]
//   // ( [RBX][63:48]   + XMM1[63:48]   + 1)/2 -> XMM1[63:48]
//   // ( [RBX][79:64]   + XMM1[79:64]   + 1)/2 -> XMM1[79:64]
//   // ( [RBX][95:80]   + XMM1[95:80]   + 1)/2 -> XMM1[95:80]
//   // ( [RBX][111:96]  + XMM1[111:96]  + 1)/2 -> XMM1[111:96]
//   // ( [RBX][127:112] + XMM1[127:112] + 1)/2 -> XMM1[127:112]
//
//  RAX [R]  ST3   PAVGW,
//   // ( [RAX][15:0]    + ST3[15:0]    + 1)/2 -> ST3[15:0]
//   // ( [RAX][31:16]   + ST3[31:16]   + 1)/2 -> ST3[31:16]
//   // ( [RAX][47:32]   + ST3[47:32]   + 1)/2 -> ST3[47:32]
//   // ( [RAX][63:48]   + ST3[63:48]   + 1)/2 -> ST3[63:48]
//
//  XMM2  XMM1  PAVGW,
//   // ( XMM2[15:0]    + XMM1[15:0]    + 1)/2 -> XMM1[15:0]
//   // ( XMM2[31:16]   + XMM1[31:16]   + 1)/2 -> XMM1[31:16]
//   // ( XMM2[47:32]   + XMM1[47:32]   + 1)/2 -> XMM1[47:32]
//   // ( XMM2[63:48]   + XMM1[63:48]   + 1)/2 -> XMM1[63:48]
//   // ( XMM2[79:64]   + XMM1[79:64]   + 1)/2 -> XMM1[79:64]
//   // ( XMM2[95:80]   + XMM1[95:80]   + 1)/2 -> XMM1[95:80]
//   // ( XMM2[111:96]  + XMM1[111:96]  + 1)/2 -> XMM1[111:96]
//   // ( XMM2[127:112] + XMM1[127:112] + 1)/2 -> XMM1[127:112]
//
//  ST2  ST1 PAVGW,
//   // ( ST2[15:0]    + ST1[15:0]    + 1)/2 -> ST1[15:0]
//   // ( ST2[31:16]   + ST1[31:16]   + 1)/2 -> ST1[31:16]
//   // ( ST2[47:32]   + ST1[47:32]   + 1)/2 -> ST1[47:32]
//   // ( ST2[63:48]   + ST1[63:48]   + 1)/2 -> ST1[63:48]
//
//  ST1  ST2  <-  PAVGW,
//   // ( ST2[15:0]    + ST1[15:0]    + 1)/2 -> ST1[15:0]
//   // ( ST2[31:16]   + ST1[31:16]   + 1)/2 -> ST1[31:16]
//   // ( ST2[47:32]   + ST1[47:32]   + 1)/2 -> ST1[47:32]
//   // ( ST2[63:48]   + ST1[63:48]   + 1)/2 -> ST1[63:48]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpavgwcomma ( VPAVGW, )
//
// C prototype:
//  void dg_forthvpavgwcomma (Bufferhandle* pBHarrayhead)
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPAVGW instruction. This opcode sequence gets the average of each
//   unsigned 16 bit integer in the memory, xmm register, or ymm
//   register source and the corresponding unsigned 16 bit integer in the xmm
//   register or ymm register in target y and puts the results
//   into the destination. If the source target is not a memory target,
//   all three targets must be the same type of register.
//   The destination can not be a memory target.
//   For each pair of 16 bit integers the calculation is:
//    (source[i] + targety[i] + 1)/2 -> dest[i]
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPAVGW,
//   // ( [RBX][15:0]    + XMM0[15:0]    + 1)/2 -> XMM1[15:0]
//   // ( [RBX][31:16]   + XMM0[31:16]   + 1)/2 -> XMM1[31:16]
//   // ( [RBX][47:32]   + XMM0[47:32]   + 1)/2 -> XMM1[47:32]
//   // ( [RBX][63:48]   + XMM0[63:48]   + 1)/2 -> XMM1[63:48]
//   // ( [RBX][79:64]   + XMM0[79:64]   + 1)/2 -> XMM1[79:64]
//   // ( [RBX][95:80]   + XMM0[95:80]   + 1)/2 -> XMM1[95:80]
//   // ( [RBX][111:96]  + XMM0[111:96]  + 1)/2 -> XMM1[111:96]
//   // ( [RBX][127:112] + XMM0[127:112] + 1)/2 -> XMM1[127:112]
//
//  YMM2  YMM0  YMM1  VPAVGW,
//   // ( YMM2[15:0]    + YMM0[15:0]    + 1)/2 -> YMM1[15:0]
//   // ( YMM2[31:16]   + YMM0[31:16]   + 1)/2 -> YMM1[31:16]
//   // ( YMM2[47:32]   + YMM0[47:32]   + 1)/2 -> YMM1[47:32]
//   // ( YMM2[63:48]   + YMM0[63:48]   + 1)/2 -> YMM1[63:48]
//   // ( YMM2[79:64]   + YMM0[79:64]   + 1)/2 -> YMM1[79:64]
//   // ( YMM2[95:80]   + YMM0[95:80]   + 1)/2 -> YMM1[95:80]
//   // ( YMM2[111:96]  + YMM0[111:96]  + 1)/2 -> YMM1[111:96]
//   // ( YMM2[127:112] + YMM0[127:112] + 1)/2 -> YMM1[127:112]
//   // ...
//   // ( YMM2[255:240] + YMM0[255:240] + 1)/2 -> YMM1[255:240]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpblendvbcomma ( PBLENDVB, )
//
// C prototype:
//  void dg_forthpblendvbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PBLENDVB instruction. This sequence copies bytes from the source
//   to the destination if the high bit for the same byte in register XMM0 is
//   set.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PBLENDVB,     // if XMM0[7]   is set then [RBX][7:0]     -> XMM1[7:0]
//                               // if XMM0[15]  is set then [RBX][15:8]    -> XMM1[15:8]
//                               // if XMM0[23]  is set then [RBX][23:16]   -> XMM1[23:16]
//                               // if XMM0[31]  is set then [RBX][31:24]   -> XMM1[31:24]
//                               // if XMM0[39]  is set then [RBX][39:32]   -> XMM1[39:32]
//                               // if XMM0[47]  is set then [RBX][47:40]   -> XMM1[47:40]
//                               // if XMM0[55]  is set then [RBX][55:48]   -> XMM1[55:48]
//                               // if XMM0[63]  is set then [RBX][63:56]   -> XMM1[63:56]
//                               // if XMM0[71]  is set then [RBX][71:64]   -> XMM1[71:64]
//                               // if XMM0[79]  is set then [RBX][79:72]   -> XMM1[79:72]
//                               // if XMM0[87]  is set then [RBX][87:80]   -> XMM1[87:80]
//                               // if XMM0[95]  is set then [RBX][95:88]   -> XMM1[95:88]
//                               // if XMM0[103] is set then [RBX][103:96]  -> XMM1[103:96]
//                               // if XMM0[111] is set then [RBX][111:104] -> XMM1[111:104]
//                               // if XMM0[119] is set then [RBX][119:112] -> XMM1[119:112]
//                               // if XMM0[127] is set then [RBX][127:120] -> XMM1[127:120]
//
//  XMM2  XMM1  PBLENDVB,        // if XMM0[7]   is set then XMM2[7:0]     -> XMM1[7:0]
//                               // if XMM0[15]  is set then XMM2[15:8]    -> XMM1[15:8]
//                               // if XMM0[23]  is set then XMM2[23:16]   -> XMM1[23:16]
//                               // if XMM0[31]  is set then XMM2[31:24]   -> XMM1[31:24]
//                               // if XMM0[39]  is set then XMM2[39:32]   -> XMM1[39:32]
//                               // if XMM0[47]  is set then XMM2[47:40]   -> XMM1[47:40]
//                               // if XMM0[55]  is set then XMM2[55:48]   -> XMM1[55:48]
//                               // if XMM0[63]  is set then XMM2[63:56]   -> XMM1[63:56]
//                               // if XMM0[71]  is set then XMM2[71:64]   -> XMM1[71:64]
//                               // if XMM0[79]  is set then XMM2[79:72]   -> XMM1[79:72]
//                               // if XMM0[87]  is set then XMM2[87:80]   -> XMM1[87:80]
//                               // if XMM0[95]  is set then XMM2[95:88]   -> XMM1[95:88]
//                               // if XMM0[103] is set then XMM2[103:96]  -> XMM1[103:96]
//                               // if XMM0[111] is set then XMM2[111:104] -> XMM1[111:104]
//                               // if XMM0[119] is set then XMM2[119:112] -> XMM1[119:112]
//                               // if XMM0[127] is set then XMM2[127:120] -> XMM1[127:120]
//
//  XMM2 <-  XMM1  PBLENDVB,  // if XMM0[7]   is set then XMM1[7:0]     -> XMM2[7:0]
//                               // if XMM0[15]  is set then XMM1[15:8]    -> XMM2[15:8]
//                               // if XMM0[23]  is set then XMM1[23:16]   -> XMM2[23:16]
//                               // if XMM0[31]  is set then XMM1[31:24]   -> XMM2[31:24]
//                               // if XMM0[39]  is set then XMM1[39:32]   -> XMM2[39:32]
//                               // if XMM0[47]  is set then XMM1[47:40]   -> XMM2[47:40]
//                               // if XMM0[55]  is set then XMM1[55:48]   -> XMM2[55:48]
//                               // if XMM0[63]  is set then XMM1[63:56]   -> XMM2[63:56]
//                               // if XMM0[71]  is set then XMM1[71:64]   -> XMM2[71:64]
//                               // if XMM0[79]  is set then XMM1[79:72]   -> XMM2[79:72]
//                               // if XMM0[87]  is set then XMM1[87:80]   -> XMM2[87:80]
//                               // if XMM0[95]  is set then XMM1[95:88]   -> XMM2[95:88]
//                               // if XMM0[103] is set then XMM1[103:96]  -> XMM2[103:96]
//                               // if XMM0[111] is set then XMM1[111:104] -> XMM2[111:104]
//                               // if XMM0[119] is set then XMM1[119:112] -> XMM2[119:112]
//                               // if XMM0[127] is set then XMM1[127:120] -> XMM2[127:120]
//
//  XMM1  XMM8  PBLENDVB,        // if XMM0[7]   is set then XMM1[7:0]     -> XMM8[7:0]
//                               // if XMM0[15]  is set then XMM1[15:8]    -> XMM8[15:8]
//                               // if XMM0[23]  is set then XMM1[23:16]   -> XMM8[23:16]
//                               // if XMM0[31]  is set then XMM1[31:24]   -> XMM8[31:24]
//                               // if XMM0[39]  is set then XMM1[39:32]   -> XMM8[39:32]
//                               // if XMM0[47]  is set then XMM1[47:40]   -> XMM8[47:40]
//                               // if XMM0[55]  is set then XMM1[55:48]   -> XMM8[55:48]
//                               // if XMM0[63]  is set then XMM1[63:56]   -> XMM8[63:56]
//                               // if XMM0[71]  is set then XMM1[71:64]   -> XMM8[71:64]
//                               // if XMM0[79]  is set then XMM1[79:72]   -> XMM8[79:72]
//                               // if XMM0[87]  is set then XMM1[87:80]   -> XMM8[87:80]
//                               // if XMM0[95]  is set then XMM1[95:88]   -> XMM8[95:88]
//                               // if XMM0[103] is set then XMM1[103:96]  -> XMM8[103:96]
//                               // if XMM0[111] is set then XMM1[111:104] -> XMM8[111:104]
//                               // if XMM0[119] is set then XMM1[119:112] -> XMM8[119:112]
//                               // if XMM0[127] is set then XMM1[127:120] -> XMM8[127:120]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpcmpeqbcomma ( PCMPEQB, )
//
// C prototype:
//  void dg_forthpcmpeqbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PCMPEQB instruction. This opcode sequence compares each
//   byte in the memory, xmm register, or floating point register
//   source with the corresponding byte in the xmm register or
//   floating point register destination. If they are equal, the destination
//   byte is changed to 0xFF, otherwise it is changed to 0. If the source target
//   is not a memory target, both targets must be the same type of register.
//   The destination can not be a memory target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PCMPEQB,
//   // if ( [RBX][7:0]      = XMM1[7:0] ) then 255 -> XMM1[7:0]
//   //  else 0 -> XMM1[7:0]
//   // if ( [RBX][15:8]    = XMM1[15:8] ) then 255 -> XMM1[15:8]
//   //  else 0 -> XMM1[15:8]
//   // ...
//   // if ( [RBX][127:120] = XMM1[127:120] ) then 255 -> XMM1[127:120]
//   //  else 0 -> XMM1[127:120]
//
//  RAX [R]  ST3   PCMPEQB,
//   // if ( [RAX][7:0]     = ST3[7:0]  ) then 255 -> ST3[7:0]
//   //  else 0 -> ST3[7:0]
//   // ...
//   // if ( [RAX][63:56]   = ST3[63:56] ) then 255  -> ST3[63:56]
//   //  else 0 -> ST3[63:56]
//
//  XMM2  XMM1  PCMPEQB,
//   // if ( XMM2[7:0]      = XMM1[7:0] ) then 255 -> XMM1[7:0]
//   //  else 0 -> XMM1[7:0]
//   // if ( XMM2[15:8]    = XMM1[15:8] ) then 255 -> XMM1[15:8]
//   //  else 0 -> XMM1[15:8]
//   // ...
//   // if ( XMM2[127:120] = XMM1[127:120] ) then 255 -> XMM1[127:120]
//   //  else 0 -> XMM1[127:120]
//
//  ST2  ST1 PCMPEQB,
//   // if ( ST2[7:0]     = ST1[7:0]  ) then 255 -> ST1[7:0]
//   //  else 0 -> ST1[7:0]
//   // ...
//   // if ( ST2[63:56]   = ST1[63:56] ) then 255  -> ST1[63:56]
//   //  else 0 -> ST1[63:56]
//
//  ST1  ST2  <-  PCMPEQB,
//   // if ( ST2[7:0]     = ST1[7:0]  ) then 255 -> ST1[7:0]
//   //  else 0 -> ST1[7:0]
//   // ...
//   // if ( ST2[63:56]   = ST1[63:56] ) then 255  -> ST1[63:56]
//   //  else 0 -> ST1[63:56]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpcmpeqdcomma ( PCMPEQD, )
//
// C prototype:
//  void dg_forthpcmpeqdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PCMPEQD instruction. This opcode sequence compares each
//   32 bit integer in the memory, xmm register, or floating point register
//   source with the corresponding 32 bit integer in the xmm register or
//   floating point register destination. If they are equal, the destination
//   32 bit integer is changed to 0xFFFFFFFF, otherwise it is changed to 0. If
//   the source target is not a memory target, both targets must be the same
//   type of register.
//   The destination can not be a memory target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PCMPEQD,
//   // if ( [RBX][31:0]      = XMM1[31:0] ) then -1 -> XMM1[31:0]
//   //  else 0 -> XMM1[31:0]
//   // if ( [RBX][63:32]    = XMM1[63:32] ) then -1 -> XMM1[63:32]
//   //  else 0 -> XMM1[63:32]
//   // ...
//   // if ( [RBX][127:96] = XMM1[127:96] ) then -1 -> XMM1[127:96]
//   //  else 0 -> XMM1[127:96]
//
//  RAX [R]  ST3   PCMPEQD,
//   // if ( [RAX][31:0]     = ST3[31:0]  ) then -1 -> ST3[31:0]
//   //  else 0 -> ST3[31:0]
//   // if ( [RAX][63:32]   = ST3[63:32] ) then -1  -> ST3[63:32]
//   //  else 0 -> ST3[63:32]
//
//  XMM2  XMM1  PCMPEQD,
//   // if ( XMM2[31:0]      = XMM1[31:0] ) then -1 -> XMM1[31:0]
//   //  else 0 -> XMM1[31:0]
//   // if ( XMM2[63:32]    = XMM1[63:32] ) then -1 -> XMM1[63:32]
//   //  else 0 -> XMM1[63:32]
//   // ...
//   // if ( XMM2[127:96] = XMM1[127:96] ) then -1 -> XMM1[127:96]
//   //  else 0 -> XMM1[127:96]
//
//  ST2  ST1 PCMPEQD,
//   // if ( ST2[31:0]     = ST1[31:0]  ) then -1 -> ST1[31:0]
//   //  else 0 -> ST1[31:0]
//   // if ( ST2[63:32]   = ST1[63:32] ) then -1  -> ST1[63:32]
//   //  else 0 -> ST1[63:32]
//
//  ST1  ST2  <-  PCMPEQD,
//   // if ( ST2[31:0]     = ST1[31:0]  ) then -1 -> ST1[31:0]
//   //  else 0 -> ST1[31:0]
//   // if ( ST2[63:32]   = ST1[63:32] ) then -1  -> ST1[63:32]
//   //  else 0 -> ST1[63:32]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpcmpeqqcomma ( PCMPEQQ, )
//
// C prototype:
//  void dg_forthpcmpeqqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PCMPEQQ instruction. This opcode sequence compares each
//   64 bit integer out of 128 bits in the memory or xmm register
//   source with the corresponding 64 bit integer in the xmm register
//   destination. If they are equal, the destination
//   64 bit integer is changed to 0xFFFFFFFFFFFFFFFF, otherwise it is changed
//   to 0.
//   The destination can not be a memory target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PCMPEQQ,
//   // if ( [RBX][63:0]      = XMM1[63:0] ) then -1 -> XMM1[63:0]
//   //  else 0 -> XMM1[63:0]
//   // if ( [RBX][127:64]    = XMM1[127:64] ) then -1 -> XMM1[127:64]
//   //  else 0 -> XMM1[127:64]
//
//  XMM2  XMM1  PCMPEQQ,
//   // if ( XMM2[63:0]      = XMM1[63:0] ) then -1 -> XMM1[63:0]
//   //  else 0 -> XMM1[63:0]
//   // if ( XMM2[127:64]    = XMM1[127:64] ) then -1 -> XMM1[127:64]
//   //  else 0 -> XMM1[127:64]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpcmpeqwcomma ( PCMPEQW, )
//
// C prototype:
//  void dg_forthpcmpeqwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PCMPEQW instruction. This opcode sequence compares each
//   16 bit integer in the memory, xmm register, or floating point register
//   source with the corresponding 16 bit integer in the xmm register or
//   floating point register destination. If they are equal, the destination
//   16 bit integer is changed to 0xFFFF, otherwise it is changed to 0. If the source
//   target is not a memory target, both targets must be the same type of
//   register.
//   The destination can not be a memory target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PCMPEQW,
//   // if ( [RBX][15:0]      = XMM1[15:0] ) then -1 -> XMM1[15:0]
//   //  else 0 -> XMM1[15:0]
//   // if ( [RBX][31:16]    = XMM1[31:16] ) then -1 -> XMM1[31:16]
//   //  else 0 -> XMM1[31:16]
//   // ...
//   // if ( [RBX][127:112] = XMM1[127:112] ) then -1 -> XMM1[127:112]
//   //  else 0 -> XMM1[127:112]
//
//  RAX [R]  ST3   PCMPEQW,
//   // if ( [RAX][15:0]     = ST3[15:0]  ) then -1 -> ST3[15:0]
//   //  else 0 -> ST3[15:0]
//   // ...
//   // if ( [RAX][63:48]   = ST3[63:48] ) then -1  -> ST3[63:48]
//   //  else 0 -> ST3[63:48]
//
//  XMM2  XMM1  PCMPEQW,
//   // if ( XMM2[15:0]      = XMM1[15:0] ) then -1 -> XMM1[15:0]
//   //  else 0 -> XMM1[15:0]
//   // if ( XMM2[31:16]    = XMM1[31:16] ) then -1 -> XMM1[31:16]
//   //  else 0 -> XMM1[31:16]
//   // ...
//   // if ( XMM2[127:112] = XMM1[127:112] ) then -1 -> XMM1[127:112]
//   //  else 0 -> XMM1[127:112]
//
//  ST2  ST1 PCMPEQW,
//   // if ( ST2[15:0]     = ST1[15:0]  ) then -1 -> ST1[15:0]
//   //  else 0 -> ST1[15:0]
//   // ...
//   // if ( ST2[63:48]   = ST1[63:48] ) then -1  -> ST1[63:48]
//   //  else 0 -> ST1[63:48]
//
//  ST1  ST2  <-  PCMPEQW,
//   // if ( ST2[15:0]     = ST1[15:0]  ) then -1 -> ST1[15:0]
//   //  else 0 -> ST1[15:0]
//   // ...
//   // if ( ST2[63:48]   = ST1[63:48] ) then -1  -> ST1[63:48]
//   //  else 0 -> ST1[63:48]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpcmpgtbcomma ( PCMPGTB, )
//
// C prototype:
//  void dg_forthpcmpgtbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PCMPGTB instruction. This opcode sequence compares each
//   signed byte in the memory, xmm register, or floating point register
//   source with the corresponding signed byte in the xmm register or
//   floating point register destination. If the destination value is greater
//   than the source value, the destination byte
//   is changed to 0xFF, otherwise it is changed to 0. If the source target
//   is not a memory target, both targets must be the same type of register.
//   The destination can not be a memory target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PCMPGTB,
//   // if ( XMM1[7:0]     > [RBX][7:0] ) then 255 -> XMM1[7:0]
//   //  else 0 -> XMM1[7:0]
//   // if ( XMM1[15:8]    > [RBX][15:8] ) then 255 -> XMM1[15:8]
//   //  else 0 -> XMM1[15:8]
//   // ...
//   // if ( XMM1[127:120] > [RBX][127:120] ) then 255 -> XMM1[127:120]
//   //  else 0 -> XMM1[127:120]
//
//  RAX [R]  ST3   PCMPGTB,
//   // if ( ST3[7:0]     > [RAX][7:0] ) then 255 -> ST3[7:0]
//   //  else 0 -> ST3[7:0]
//   // ...
//   // if ( ST3[63:56]   > [RAX][63:56] ) then 255  -> ST3[63:56]
//   //  else 0 -> ST3[63:56]
//
//  XMM2  XMM1  PCMPGTB,
//   // if ( XMM1[7:0]      > XMM2[7:0] ) then 255 -> XMM1[7:0]
//   //  else 0 -> XMM1[7:0]
//   // if ( XMM1[15:8]    > XMM2[15:8] ) then 255 -> XMM1[15:8]
//   //  else 0 -> XMM1[15:8]
//   // ...
//   // if ( XMM1[127:120] > XMM2[127:120] ) then 255 -> XMM1[127:120]
//   //  else 0 -> XMM1[127:120]
//
//  ST2  ST1 PCMPGTB,
//   // if ( ST1[7:0]     > ST2[7:0]  ) then 255 -> ST1[7:0]
//   //  else 0 -> ST1[7:0]
//   // ...
//   // if ( ST1[63:56]   > ST2[63:56] ) then 255  -> ST1[63:56]
//   //  else 0 -> ST1[63:56]
//
//  ST1  ST2  <-  PCMPGTB,
//   // if ( ST1[7:0]     > ST2[7:0]  ) then 255 -> ST1[7:0]
//   //  else 0 -> ST1[7:0]
//   // ...
//   // if ( ST1[63:56]   > ST2[63:56] ) then 255  -> ST1[63:56]
//   //  else 0 -> ST1[63:56]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpcmpgtdcomma ( PCMPGTD, )
//
// C prototype:
//  void dg_forthpcmpgtdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PCMPGTD instruction. This opcode sequence compares each
//   signed 32 bit integer in the memory, xmm register, or floating point
//   register source with the corresponding signed 32 bit integer in the xmm
//   register or floating point register destination. If the destination value
//   is greater than the source value, the destination 32 bit integer is changed
//   to 0xFFFFFFFF, otherwise it is changed to 0. If the source target is not a
//   memory target, both targets must be the same type of register.
//   The destination can not be a memory target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PCMPGTD,
//   // if ( XMM1[31:0]      > [RBX][31:0] ) then -1 -> XMM1[31:0]
//   //  else 0 -> XMM1[31:0]
//   // if ( XMM1[63:32]    > [RBX][63:32] ) then -1 -> XMM1[63:32]
//   //  else 0 -> XMM1[63:32]
//   // ...
//   // if ( XMM1[127:96] > [RBX][127:96] ) then -1 -> XMM1[127:96]
//   //  else 0 -> XMM1[127:96]
//
//  RAX [R]  ST3   PCMPGTD,
//   // if ( ST3[31:0]     > [RAX][31:0] ) then -1 -> ST3[31:0]
//   //  else 0 -> ST3[31:0]
//   // if ( ST3[63:32]   > [RAX][63:32] ) then -1  -> ST3[63:32]
//   //  else 0 -> ST3[63:32]
//
//  XMM2  XMM1  PCMPGTD,
//   // if ( XMM1[31:0]   > XMM2[31:0] )  then -1 -> XMM1[31:0]
//   //  else 0 -> XMM1[31:0]
//   // if ( XMM1[63:32]  > XMM2[63:32] ) then -1 -> XMM1[63:32]
//   //  else 0 -> XMM1[63:32]
//   // ...
//   // if ( XMM1[127:96] > XMM2[127:96] ) then -1 -> XMM1[127:96]
//   //  else 0 -> XMM1[127:96]
//
//  ST2  ST1 PCMPGTD,
//   // if ( ST1[31:0]    > ST2[31:0]  ) then -1 -> ST1[31:0]
//   //  else 0 -> ST1[31:0]
//   // if ( ST1[63:32]   > ST2[63:32] ) then -1  -> ST1[63:32]
//   //  else 0 -> ST1[63:32]
//
//  ST1  ST2  <-  PCMPGTD,
//   // if ( ST1[31:0]     > ST2[31:0]  ) then -1 -> ST1[31:0]
//   //  else 0 -> ST1[31:0]
//   // if ( ST1[63:32]   > ST2[63:32] ) then -1  -> ST1[63:32]
//   //  else 0 -> ST1[63:32]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpcmpgtwcomma ( PCMPGTW, )
//
// C prototype:
//  void dg_forthpcmpgtwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PCMPGTW instruction. This opcode sequence compares each
//   16 bit signed integer in the memory, xmm register, or floating point
//   register source with the corresponding signed 16 bit integer in the xmm
//   register or floating point register destination. If the destination is
//   greater than the source, the destination value is changed to 0xFFFF,
//   otherwise it is changed to 0. If the source target is not a memory target,
//   both targets must be the same type of register.
//   The destination can not be a memory target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PCMPGTW,
//   // if ( XMM1[15:0]      > [RBX][15:0] ) then -1 -> XMM1[15:0]
//   //  else 0 -> XMM1[15:0]
//   // if ( XMM1[31:16]    > [RBX][31:16] ) then -1 -> XMM1[31:16]
//   //  else 0 -> XMM1[31:16]
//   // ...
//   // if ( XMM1[127:112] > [RBX][127:112] ) then -1 -> XMM1[127:112]
//   //  else 0 -> XMM1[127:112]
//
//  RAX [R]  ST3   PCMPGTW,
//   // if ( ST3[15:0]     > [RAX][15:0]  ) then -1 -> ST3[15:0]
//   //  else 0 -> ST3[15:0]
//   // ...
//   // if ( ST3[63:48]   > [RAX][63:48] ) then -1  -> ST3[63:48]
//   //  else 0 -> ST3[63:48]
//
//  XMM2  XMM1  PCMPGTW,
//   // if ( XMM1[15:0]      > XMM2[15:0] ) then -1 -> XMM1[15:0]
//   //  else 0 -> XMM1[15:0]
//   // if ( XMM1[31:16]    > XMM2[31:16] ) then -1 -> XMM1[31:16]
//   //  else 0 -> XMM1[31:16]
//   // ...
//   // if ( XMM1[127:112] > XMM2[127:112] ) then -1 -> XMM1[127:112]
//   //  else 0 -> XMM1[127:112]
//
//  ST2  ST1 PCMPGTW,
//   // if ( ST1[15:0]     > ST2[15:0]  ) then -1 -> ST1[15:0]
//   //  else 0 -> ST1[15:0]
//   // ...
//   // if ( ST1[63:48]   > ST2[63:48] ) then -1  -> ST1[63:48]
//   //  else 0 -> ST1[63:48]
//
//  ST1  ST2  <-  PCMPGTW,
//   // if ( ST1[15:0]     > ST2[15:0]  ) then -1 -> ST1[15:0]
//   //  else 0 -> ST1[15:0]
//   // ...
//   // if ( ST1[63:48]   > ST2[63:48] ) then -1  -> ST1[63:48]
//   //  else 0 -> ST1[63:48]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpcmpgtqcomma ( PCMPGTQ, )
//
// C prototype:
//  void dg_forthpcmpgtqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PCMPGTQ instruction. This sequence compares the high 64 bit signed
//   integer in the destination with the high 64 bit signed integer in the source
//   and if the destination value is greater than the source value, the
//   high 64 bits in the destination are changed to all 1s, otherwise they are
//   changed to all 0s. The same thing is done for the low 64 bits of the source
//   and destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PCMPGTQ,     // if XMM1[63:0] > [RBX][63:0] then
//                              //  -1 -> XMM1[63:0] else 0 -> XMM1[63:0]
//                              // if XMM1[127:64] > [RBX][127:64] then
//                              //  -1 -> XMM1[127:64] else 0 -> XMM1[127:64]
//
//  XMM2  XMM1  PCMPGTQ,        // if XMM1[63:0] > XMM2[63:0] then
//                              //  -1 -> XMM1[63:0] else 0 -> XMM1[63:0]
//                              // if XMM1[127:64] > XMM2[127:64] then
//                              //  -1 -> XMM1[127:64] else 0 -> XMM1[127:64]
//
//  XMM2 <-  XMM1  PCMPGTQ,  // if XMM2[63:0] > XMM1[63:0] then
//                              //  -1 -> XMM2[63:0] else 0 -> XMM2[63:0]
//                              // if XMM2[127:64] > XMM1[127:64] then
//                              //  -1 -> XMM2[127:64] else 0 -> XMM2[127:64]
//
//  XMM1  XMM8  PCMPGTQ,        // if XMM8[63:0] > XMM1[63:0] then
//                              //  -1 -> XMM8[63:0] else 0 -> XMM8[63:0]
//                              // if XMM8[127:64] > XMM1[127:64] then
//                              //  -1 -> XMM8[127:64] else 0 -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthphminposuwcomma ( PHMINPOSUW, )
//
// C prototype:
//  void dg_forthphminposuwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PHMINPOSUW instruction. This sequence treats the source as an array
//   of 16 bit unsigned integers, finds the minimum value in that array, and
//   the 0 based index of that value. The minimum value is stored in the lowest
//   16 bits of the destination. The index is stored in bits 18, 17, and 16.
//   The rest of the bits in the destination are cleared.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PHMINPOSUW,     // minimum( [RBX][15:0], [RBX][31:16], [RBX][47:32],
//                                 //  [RBX][63:48], [RBX][79:64], [RBX][95:80],
//                                 //  [RBX][111:96], [RBX][127:112]) -> XMM1[15:0]
//                                 //  indexofminimumvalue -> XMM1[18:16]
//
//  XMM2  XMM1  PHMINPOSUW,        // minimum( XMM2[15:0], XMM2[31:16], XMM2[47:32],
//                                 //  XMM2[63:48], XMM2[79:64], XMM2[95:80],
//                                 //  XMM2[111:96], XMM2[127:112]) -> XMM1[15:0]
//                                 //  indexofminimumvalue -> XMM1[18:16]
//
//  XMM2 <-  XMM1  PHMINPOSUW,  // minimum( XMM1[15:0], XMM1[31:16], XMM1[47:32],
//                                 //  XMM1[63:48], XMM1[79:64], XMM1[95:80],
//                                 //  XMM1[111:96], XMM1[127:112]) -> XMM2[15:0]
//                                 //  indexofminimumvalue -> XMM2[18:16]
//
//  XMM1  XMM8  PHMINPOSUW,        // minimum( XMM1[15:0], XMM1[31:16], XMM1[47:32],
//                                 //  XMM1[63:48], XMM1[79:64], XMM1[95:80],
//                                 //  XMM1[111:96], XMM1[127:112]) -> XMM8[15:0]
//                                 //  indexofminimumvalue -> XMM8[18:16]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvphminposuwcomma ( VPHMINPOSUW, )
//
// C prototype:
//  void dg_forthvphminposuwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VPHMINPOSUW instruction. This sequence treats the source as an array
//   of 16 bit unsigned integers, finds the minimum value in that array, and
//   the 0 based index of that value. The minimum value is stored in the lowest
//   16 bits of the destination. The index is stored in bits 18, 17, and 16.
//   The rest of the bits in the destination are cleared.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  VPHMINPOSUW,    // minimum( [RBX][15:0], [RBX][31:16], [RBX][47:32],
//                                 //  [RBX][63:48], [RBX][79:64], [RBX][95:80],
//                                 //  [RBX][111:96], [RBX][127:112]) -> XMM1[15:0]
//                                 //  indexofminimumvalue -> XMM1[18:16]
//
//  XMM2  XMM1  VPHMINPOSUW,       // minimum( XMM2[15:0], XMM2[31:16], XMM2[47:32],
//                                 //  XMM2[63:48], XMM2[79:64], XMM2[95:80],
//                                 //  XMM2[111:96], XMM2[127:112]) -> XMM1[15:0]
//                                 //  indexofminimumvalue -> XMM1[18:16]
//
//  XMM2 <-  XMM1  VPHMINPOSUW, // minimum( XMM1[15:0], XMM1[31:16], XMM1[47:32],
//                                 //  XMM1[63:48], XMM1[79:64], XMM1[95:80],
//                                 //  XMM1[111:96], XMM1[127:112]) -> XMM2[15:0]
//                                 //  indexofminimumvalue -> XMM2[18:16]
//
//  XMM1  XMM8  VPHMINPOSUW,       // minimum( XMM1[15:0], XMM1[31:16], XMM1[47:32],
//                                 //  XMM1[63:48], XMM1[79:64], XMM1[95:80],
//                                 //  XMM1[111:96], XMM1[127:112]) -> XMM8[15:0]
//                                 //  indexofminimumvalue -> XMM8[18:16]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmaddubswcomma ( VPMADDUBSW, )
//
// C prototype:
//  void dg_forthvpmaddubswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPMADDUBSW instruction. This sequence multiplies each unsigned byte 
//   integer in target y by each signed byte integer in the source then pairs of 
//   results are added to produce signed 16 bit final results which are put into
//   the destination. If the final results are more or less than what will fit 
//   into signed 16 bit integers, the final results are clipped to the largest 
//   or smallest possible 16 bit integer.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPMADDUBSW,    
//                           // (XMM0[7:0]     * [RBX][7:0]) +    
//                           // (XMM0[15:8]    * [RBX][15:8])   -> XMM1[15:0]
//                           // (XMM0[23:16]   * [RBX][23:16]) +    
//                           // (XMM0[31:24]   * [RBX][31:24])  -> XMM1[31:16]
//                           // ...
//                           // (XMM0[119:112] * [RBX][119:112]) + 
//                           // (XMM0[127:120] * [RBX][127:120]) -> XMM1[127:112]
//
//  YMM2  YMM0  YMM1  VPMADDUBSW,       
//                           // (YMM0[7:0]     * YMM2[7:0]) +    
//                           // (YMM0[15:8]    * YMM2[15:8])    -> YMM1[15:0]
//                           // (YMM0[23:16]   * YMM2[23:16]) +    
//                           // (YMM0[31:24]   * YMM2[31:24])   -> YMM1[31:16]
//                           // ...
//                           // (YMM0[247:240] * YMM2[247:240]) + 
//                           // (YMM0[255:248] * YMM2[255:248]) -> YMM1[255:240]
//
//  XMM1 <-  XMM0  XMM2  VPMADDUBSW, 
//                           // (XMM0[7:0]     * XMM2[7:0]) +    
//                           // (XMM0[15:8]    * XMM2[15:8])    -> XMM1[15:0]
//                           // (XMM0[23:16]   * XMM2[23:16]) +    
//                           // (XMM0[31:24]   * XMM2[31:24])   -> XMM1[31:16]
//                           // ...
//                           // (XMM0[119:112] * XMM2[119:112]) + 
//                           // (XMM0[127:120] * XMM2[127:120]) -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y must 
//   be an xmm register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmaddwdcomma ( VPMADDWD, )
//
// C prototype:
//  void dg_forthvpmaddwdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPMADDWD instruction. This sequence multiplies each signed 16 bit 
//   integer in target y by each signed 16 bit integer in the source then pairs of 
//   results are added to produce signed 32 bit final results which are put into 
//   the destination. If both values of a pair are the most negative 16 bit
//   integer possible, the most negative 32 bit integer result is returned.
//   Intel docs say no other cases will overflow.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPMADDWD,    
//                           // (XMM0[15:0]    * [RBX][15:0]) +    
//                           // (XMM0[31:16]   * [RBX][31:16])  -> XMM1[31:0]
//                           // (XMM0[47:32]   * [RBX][47:32]) +    
//                           // (XMM0[63:48]   * [RBX][63:48])  -> XMM1[63:32]
//                           // ...
//                           // (XMM0[111:96]  * [RBX][111:96]) + 
//                           // (XMM0[127:112] * [RBX][127:112]) -> XMM1[127:96]
//
//  YMM2  YMM0  YMM1  VPMADDWD,       
//                           // (YMM0[15:0]    * YMM2[15:0]) +    
//                           // (YMM0[31:16]   * YMM2[31:16])   -> YMM1[31:0]
//                           // (YMM0[47:32]   * YMM2[47:32]) +    
//                           // (YMM0[63:48]   * YMM2[63:48])   -> YMM1[63:32]
//                           // ...
//                           // (YMM0[239:224] * YMM2[239:224]) + 
//                           // (YMM0[255:240] * YMM2[255:240]) -> YMM1[255:224]
//
//  XMM1 <-  XMM0  XMM2  VPMADDWD, 
//                           // (XMM0[15:0]    * XMM2[15:0]) +    
//                           // (XMM0[31:16]   * XMM2[31:16])   -> XMM1[31:0]
//                           // (XMM0[47:32]   * XMM2[47:32]) +    
//                           // (XMM0[63:48]   * XMM2[63:48])   -> XMM1[63:32]
//                           // ...
//                           // (XMM0[111:96]  * XMM2[111:96]) + 
//                           // (XMM0[127:112] * XMM2[127:112]) -> XMM1[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y must 
//   be an xmm register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmaxsbcomma ( VPMAXSB, )
//
// C prototype:
//  void dg_forthvpmaxsbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPMAXSB instruction. This sequence compares each signed byte
//   integer from target y with each corresponding signed byte integer from  
//   the source and copies the largest to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPMAXSB,    
//                           // MAX(XMM0[7:0],     [RBX][7:0])     -> XMM1[7:0]    
//                           // MAX(XMM0[15:8],    [RBX][15:8])    -> XMM1[15:8]
//                           // ...
//                           // MAX(XMM0[127:120], [RBX][127:120]) -> XMM1[127:120]
//
//  YMM2  YMM0  YMM1  VPMAXSB,       
//                           // MAX(YMM0[7:0],     YMM2[7:0])     -> YMM1[7:0]    
//                           // MAX(YMM0[15:8],    YMM2[15:8])    -> YMM1[15:8]
//                           // ...
//                           // MAX(YMM0[255:248], YMM2[255:248]) -> YMM1[255:248]
//
//  XMM1 <-  XMM0  XMM2  VPMAXSB, 
//                           // MAX(XMM0[7:0],     XMM2[7:0])     -> XMM1[7:0]    
//                           // MAX(XMM0[15:8],    XMM2[15:8])    -> XMM1[15:8]
//                           // ...
//                           // MAX(XMM0[127:120], XMM2[127:120]) -> XMM1[127:120]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y must 
//   be an xmm register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmaxubcomma ( VPMAXUB, )
//
// C prototype:
//  void dg_forthvpmaxubcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPMAXUB instruction. This sequence compares each unsigned byte
//   integer from target y with each corresponding unsigned byte integer from 
//   the source  and copies the largest to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPMAXUB,    
//                           // MAX(XMM0[7:0],     [RBX][7:0])     -> XMM1[7:0]    
//                           // MAX(XMM0[15:8],    [RBX][15:8])    -> XMM1[15:8]
//                           // ...
//                           // MAX(XMM0[127:120], [RBX][127:120]) -> XMM1[127:120]
//
//  YMM2  YMM0  YMM1  VPMAXUB,       
//                           // MAX(YMM0[7:0],     YMM2[7:0])     -> YMM1[7:0]    
//                           // MAX(YMM0[15:8],    YMM2[15:8])    -> YMM1[15:8]
//                           // ...
//                           // MAX(YMM0[255:248], YMM2[255:248]) -> YMM1[255:248]
//
//  XMM1 <-  XMM0  XMM2  VPMAXUB, 
//                           // MAX(XMM0[7:0],     XMM2[7:0])     -> XMM1[7:0]    
//                           // MAX(XMM0[15:8],    XMM2[15:8])    -> XMM1[15:8]
//                           // ...
//                           // MAX(XMM0[127:120], XMM2[127:120]) -> XMM1[127:120]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y must 
//   be an xmm register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpminsbcomma ( VPMINSB, )
//
// C prototype:
//  void dg_forthvpminsbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPMINSB instruction. This sequence compares each signed byte
//   integer from target y with each corresponding signed byte integer from  
//   the source and copies the smallest to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPMINSB,    
//                           // MIN(XMM0[7:0],     [RBX][7:0])     -> XMM1[7:0]    
//                           // MIN(XMM0[15:8],    [RBX][15:8])    -> XMM1[15:8]
//                           // ...
//                           // MIN(XMM0[127:120], [RBX][127:120]) -> XMM1[127:120]
//
//  YMM2  YMM0  YMM1  VPMINSB,       
//                           // MIN(YMM0[7:0],     YMM2[7:0])     -> YMM1[7:0]    
//                           // MIN(YMM0[15:8],    YMM2[15:8])    -> YMM1[15:8]
//                           // ...
//                           // MIN(YMM0[255:248], YMM2[255:248]) -> YMM1[255:248]
//
//  XMM1 <-  XMM0  XMM2  VPMINSB, 
//                           // MIN(XMM0[7:0],     XMM2[7:0])     -> XMM1[7:0]    
//                           // MIN(XMM0[15:8],    XMM2[15:8])    -> XMM1[15:8]
//                           // ...
//                           // MIN(XMM0[127:120], XMM2[127:120]) -> XMM1[127:120]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y must 
//   be an xmm register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpminubcomma ( VPMINUB, )
//
// C prototype:
//  void dg_forthvpminubcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPMINUB instruction. This sequence compares each unsigned byte
//   integer from target y with each corresponding unsigned byte integer from  
//   the source and copies the smallest to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPMINUB,    
//                           // MIN(XMM0[7:0],     [RBX][7:0])     -> XMM1[7:0]    
//                           // MIN(XMM0[15:8],    [RBX][15:8])    -> XMM1[15:8]
//                           // ...
//                           // MIN(XMM0[127:120], [RBX][127:120]) -> XMM1[127:120]
//
//  YMM2  YMM0  YMM1  VPMINUB,       
//                           // MIN(YMM0[7:0],     YMM2[7:0])     -> YMM1[7:0]    
//                           // MIN(YMM0[15:8],    YMM2[15:8])    -> YMM1[15:8]
//                           // ...
//                           // MIN(YMM0[255:248], YMM2[255:248]) -> YMM1[255:248]
//
//  XMM1 <-  XMM0  XMM2  VPMINUB, 
//                           // MIN(XMM0[7:0],     XMM2[7:0])     -> XMM1[7:0]    
//                           // MIN(XMM0[15:8],    XMM2[15:8])    -> XMM1[15:8]
//                           // ...
//                           // MIN(XMM0[127:120], XMM2[127:120]) -> XMM1[127:120]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y must 
//   be an xmm register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmaxsdcomma ( VPMAXSD, )
//
// C prototype:
//  void dg_forthvpmaxsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPMAXSD instruction. This sequence compares each signed 32 bit
//   integer from target y with each corresponding signed 32 bit integer from  
//   the source and copies the largest to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPMAXSD,    
//                           // MAX(XMM0[31:0],    [RBX][31:0])    -> XMM1[31:0]    
//                           // MAX(XMM0[63:32],   [RBX][63:32])   -> XMM1[63:32]
//                           // ...
//                           // MAX(XMM0[127:96],  [RBX][127:96])  -> XMM1[127:96]
//
//  YMM2  YMM0  YMM1  VPMAXSD,       
//                           // MAX(YMM0[31:0],    YMM2[31:0])    -> YMM1[31:0]    
//                           // MAX(YMM0[63:32],   YMM2[63:32])   -> YMM1[63:32]
//                           // ...
//                           // MAX(YMM0[255:224], YMM2[255:224]) -> YMM1[255:224]
//
//  XMM1 <-  XMM0  XMM2  VPMAXSD, 
//                           // MAX(XMM0[31:0],    XMM2[31:0])    -> XMM1[31:0]    
//                           // MAX(XMM0[63:32],   XMM2[63:32])   -> XMM1[63:32]
//                           // ...
//                           // MAX(XMM0[127:96],  XMM2[127:96])  -> XMM1[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y must 
//   be an xmm register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmaxudcomma ( VPMAXUD, )
//
// C prototype:
//  void dg_forthvpmaxudcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPMAXUD instruction. This sequence compares each unsigned 32 bit
//   integer from target y with each corresponding unsigned 32 bit integer from  
//   the source and copies the largest to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPMAXUD,    
//                           // MAX(XMM0[31:0],    [RBX][31:0])    -> XMM1[31:0]    
//                           // MAX(XMM0[63:32],   [RBX][63:32])   -> XMM1[63:32]
//                           // ...
//                           // MAX(XMM0[127:96],  [RBX][127:96])  -> XMM1[127:96]
//
//  YMM2  YMM0  YMM1  VPMAXUD,       
//                           // MAX(YMM0[31:0],    YMM2[31:0])    -> YMM1[31:0]    
//                           // MAX(YMM0[63:32],   YMM2[63:32])   -> YMM1[63:32]
//                           // ...
//                           // MAX(YMM0[255:224], YMM2[255:224]) -> YMM1[255:224]
//
//  XMM1 <-  XMM0  XMM2  VPMAXUD, 
//                           // MAX(XMM0[31:0],    XMM2[31:0])    -> XMM1[31:0]    
//                           // MAX(XMM0[63:32],   XMM2[63:32])   -> XMM1[63:32]
//                           // ...
//                           // MAX(XMM0[127:96],  XMM2[127:96])  -> XMM1[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y must 
//   be an xmm register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpminsdcomma ( VPMINSD, )
//
// C prototype:
//  void dg_forthvpminsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPMINSD instruction. This sequence compares each signed 32 bit
//   integer from target y with each corresponding signed 32 bit integer from  
//   the source and copies the smallest to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPMINSD,    
//                           // MIN(XMM0[31:0],    [RBX][31:0])    -> XMM1[31:0]    
//                           // MIN(XMM0[63:32],   [RBX][63:32])   -> XMM1[63:32]
//                           // ...
//                           // MIN(XMM0[127:96],  [RBX][127:96])  -> XMM1[127:96]
//
//  YMM2  YMM0  YMM1  VPMINSD,       
//                           // MIN(YMM0[31:0],    YMM2[31:0])    -> YMM1[31:0]    
//                           // MIN(YMM0[63:32],   YMM2[63:32])   -> YMM1[63:32]
//                           // ...
//                           // MIN(YMM0[255:224], YMM2[255:224]) -> YMM1[255:224]
//
//  XMM1 <-  XMM0  XMM2  VPMINSD, 
//                           // MIN(XMM0[31:0],    XMM2[31:0])    -> XMM1[31:0]    
//                           // MIN(XMM0[63:32],   XMM2[63:32])   -> XMM1[63:32]
//                           // ...
//                           // MIN(XMM0[127:96],  XMM2[127:96])  -> XMM1[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y must 
//   be an xmm register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpminudcomma ( VPMINUD, )
//
// C prototype:
//  void dg_forthvpminudcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPMINUD instruction. This sequence compares each unsigned 32 bit
//   integer from target y with each corresponding unsigned 32 bit integer from  
//   the source and copies the smallest to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPMINUD,    
//                           // MIN(XMM0[31:0],    [RBX][31:0])    -> XMM1[31:0]    
//                           // MIN(XMM0[63:32],   [RBX][63:32])   -> XMM1[63:32]
//                           // ...
//                           // MIN(XMM0[127:96],  [RBX][127:96])  -> XMM1[127:96]
//
//  YMM2  YMM0  YMM1  VPMINUD,       
//                           // MIN(YMM0[31:0],    YMM2[31:0])    -> YMM1[31:0]    
//                           // MIN(YMM0[63:32],   YMM2[63:32])   -> YMM1[63:32]
//                           // ...
//                           // MIN(YMM0[255:224], YMM2[255:224]) -> YMM1[255:224]
//
//  XMM1 <-  XMM0  XMM2  VPMINUD, 
//                           // MIN(XMM0[31:0],    XMM2[31:0])    -> XMM1[31:0]    
//                           // MIN(XMM0[63:32],   XMM2[63:32])   -> XMM1[63:32]
//                           // ...
//                           // MIN(XMM0[127:96],  XMM2[127:96])  -> XMM1[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y must 
//   be an xmm register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmaxswcomma ( VPMAXSW, )
//
// C prototype:
//  void dg_forthvpmaxswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPMAXSW instruction. This sequence compares each signed 16 bit
//   integer from target y with each corresponding signed 16 bit integer from  
//   the source and copies the largest to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPMAXSW,    
//                           // MAX(XMM0[15:0],    [RBX][15:0])    -> XMM1[15:0]    
//                           // MAX(XMM0[31:16],   [RBX][31:16])   -> XMM1[31:16]
//                           // ...
//                           // MAX(XMM0[127:112], [RBX][127:112]) -> XMM1[127:112]
//
//  YMM2  YMM0  YMM1  VPMAXSW,       
//                           // MAX(YMM0[15:0],    YMM2[15:0])    -> YMM1[15:0]    
//                           // MAX(YMM0[31:16],   YMM2[31:16])   -> YMM1[31:16]
//                           // ...
//                           // MAX(YMM0[255:240], YMM2[255:240]) -> YMM1[255:240]
//
//  XMM1 <-  XMM0  XMM2  VPMAXSW, 
//                           // MAX(XMM0[15:0],    XMM2[15:0])    -> XMM1[15:0]    
//                           // MAX(XMM0[31:16],   XMM2[31:16])   -> XMM1[31:16]
//                           // ...
//                           // MAX(XMM0[127:112], XMM2[127:112]) -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y must 
//   be an xmm register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmaxuwcomma ( VPMAXUW, )
//
// C prototype:
//  void dg_forthvpmaxuwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPMAXUW instruction. This sequence compares each unsigned 16 bit
//   integer from target y with each corresponding unsigned 16 bit integer from  
//   the source and copies the largest to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPMAXUW,    
//                           // MAX(XMM0[15:0],    [RBX][15:0])    -> XMM1[15:0]    
//                           // MAX(XMM0[31:16],   [RBX][31:16])   -> XMM1[31:16]
//                           // ...
//                           // MAX(XMM0[127:112], [RBX][127:112]) -> XMM1[127:112]
//
//  YMM2  YMM0  YMM1  VPMAXUW,       
//                           // MAX(YMM0[15:0],    YMM2[15:0])    -> YMM1[15:0]    
//                           // MAX(YMM0[31:16],   YMM2[31:16])   -> YMM1[31:16]
//                           // ...
//                           // MAX(YMM0[255:240], YMM2[255:240]) -> YMM1[255:240]
//
//  XMM1 <-  XMM0  XMM2  VPMAXUW, 
//                           // MAX(XMM0[15:0],    XMM2[15:0])    -> XMM1[15:0]    
//                           // MAX(XMM0[31:16],   XMM2[31:16])   -> XMM1[31:16]
//                           // ...
//                           // MAX(XMM0[127:112], XMM2[127:112]) -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y must 
//   be an xmm register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpminswcomma ( VPMINSW, )
//
// C prototype:
//  void dg_forthvpminswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either set of addressing
//                                 mode parameters and cannot come in the middle
//                                 of addressing mode parameters.
//   <-                           sets the direction to reverse.
//                                 This is pushed after either set of addressing
//                                 mode parameters and cannot come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPMINSW instruction. This sequence compares each signed 16 bit
//   integer from target y with each corresponding signed 16 bit integer from  
//   the source and copies the smallest to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPMINSW,    
//                           // MIN(XMM0[15:0],    [RBX][15:0])    -> XMM1[15:0]    
//                           // MIN(XMM0[31:16],   [RBX][31:16])   -> XMM1[31:16]
//                           // ...
//                           // MIN(XMM0[127:112], [RBX][127:112]) -> XMM1[127:112]
//
//  YMM2  YMM0  YMM1  VPMINSW,       
//                           // MIN(YMM0[15:0],    YMM2[15:0])    -> YMM1[15:0]    
//                           // MIN(YMM0[31:16],   YMM2[31:16])   -> YMM1[31:16]
//                           // ...
//                           // MIN(YMM0[255:240], YMM2[255:240]) -> YMM1[255:240]
//
//  XMM1 <-  XMM0  XMM2  VPMINSW, 
//                           // MIN(XMM0[15:0],    XMM2[15:0])    -> XMM1[15:0]    
//                           // MIN(XMM0[31:16],   XMM2[31:16])   -> XMM1[31:16]
//                           // ...
//                           // MIN(XMM0[127:112], XMM2[127:112]) -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y must 
//   be an xmm register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpminuwcomma ( VPMINUW, )
//
// C prototype:
//  void dg_forthvpminuwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either set of addressing
//                                 mode parameters and cannot come in the middle
//                                 of addressing mode parameters.
//   <-                           sets the direction to reverse.
//                                 This is pushed after either set of addressing
//                                 mode parameters and cannot come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPMINUW instruction. This sequence compares each unsigned 16 bit
//   integer from target y with each corresponding unsigned 16 bit integer from  
//   the source and copies the smallest to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPMINUW,    
//                           // MIN(XMM0[15:0],    [RBX][15:0])    -> XMM1[15:0]    
//                           // MIN(XMM0[31:16],   [RBX][31:16])   -> XMM1[31:16]
//                           // ...
//                           // MIN(XMM0[127:112], [RBX][127:112]) -> XMM1[127:112]
//
//  YMM2  YMM0  YMM1  VPMINUW,       
//                           // MIN(YMM0[15:0],    YMM2[15:0])    -> YMM1[15:0]    
//                           // MIN(YMM0[31:16],   YMM2[31:16])   -> YMM1[31:16]
//                           // ...
//                           // MIN(YMM0[255:240], YMM2[255:240]) -> YMM1[255:240]
//
//  XMM1 <-  XMM0  XMM2  VPMINUW, 
//                           // MIN(XMM0[15:0],    XMM2[15:0])    -> XMM1[15:0]    
//                           // MIN(XMM0[31:16],   XMM2[31:16])   -> XMM1[31:16]
//                           // ...
//                           // MIN(XMM0[127:112], XMM2[127:112]) -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y must 
//   be an xmm register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpmaxsbcomma ( PMAXSB, )
//
// C prototype:
//  void dg_forthpmaxsbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either set of addressing
//                                 mode parameters and cannot come in the middle
//                                 of addressing mode parameters.
//   <-                           sets the direction to reverse.
//                                 This is pushed after either set of addressing
//                                 mode parameters and cannot come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMAXSB instruction. This sequence compares each signed byte of the
//   source with each signed byte in the destination and if the source value
//   is greater, it is copied to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMAXSB,     // if [RBX][7:0] > XMM1[7:0] then
//                                 [RBX][7:0] -> XMM1[7:0]
//                             // if [RBX][15:8] > XMM1[15:8] then
//                                 [RBX][15:8] -> XMM1[15:8]
//                             // if [RBX][23:16] > XMM1[23:16] then
//                                 [RBX][23:16] -> XMM1[23:16]
//                             // if [RBX][31:24] > XMM1[31:24] then
//                                 [RBX][31:24] -> XMM1[31:24]
//                             // if [RBX][39:32] > XMM1[39:32] then
//                                 [RBX][39:32] -> XMM1[39:32]
//                             // if [RBX][47:40] > XMM1[47:40] then
//                                 [RBX][47:40] -> XMM1[47:40]
//                             // if [RBX][55:48] > XMM1[55:48] then
//                                 [RBX][55:48] -> XMM1[55:48]
//                             // if [RBX][63:56] > XMM1[63:56] then
//                                 [RBX][63:56] -> XMM1[63:56]
//                             // if [RBX][71:64] > XMM1[71:64] then
//                                 [RBX][71:64] -> XMM1[71:64]
//                             // if [RBX][79:72] > XMM1[79:72] then
//                                 [RBX][79:72] -> XMM1[79:72]
//                             // if [RBX][87:80] > XMM1[87:80] then
//                                 [RBX][87:80] -> XMM1[87:80]
//                             // if [RBX][95:88] > XMM1[95:88] then
//                                 [RBX][95:88] -> XMM1[95:88]
//                             // if [RBX][103:96] > XMM1[103:96] then
//                                 [RBX][103:96] -> XMM1[103:96]
//                             // if [RBX][111:104] > XMM1[111:104] then
//                                 [RBX][111:104] -> XMM1[111:104]
//                             // if [RBX][119:112] > XMM1[119:112] then
//                                 [RBX][119:112] -> XMM1[119:112]
//                             // if [RBX][127:120] > XMM1[127:120] then
//                                 [RBX][127:120] -> XMM1[127:120]
//
//  XMM2  XMM1  PMAXSB,        // if XMM2[7:0] > XMM1[7:0] then
//                                 XMM2[7:0] -> XMM1[7:0]
//                             // if XMM2[15:8] > XMM1[15:8] then
//                                 XMM2[15:8] -> XMM1[15:8]
//                             // if XMM2[23:16] > XMM1[23:16] then
//                                 XMM2[23:16] -> XMM1[23:16]
//                             // if XMM2[31:24] > XMM1[31:24] then
//                                 XMM2[31:24] -> XMM1[31:24]
//                             // if XMM2[39:32] > XMM1[39:32] then
//                                 XMM2[39:32] -> XMM1[39:32]
//                             // if XMM2[47:40] > XMM1[47:40] then
//                                 XMM2[47:40] -> XMM1[47:40]
//                             // if XMM2[55:48] > XMM1[55:48] then
//                                 XMM2[55:48] -> XMM1[55:48]
//                             // if XMM2[63:56] > XMM1[63:56] then
//                                 XMM2[63:56] -> XMM1[63:56]
//                             // if XMM2[71:64] > XMM1[71:64] then
//                                 XMM2[71:64] -> XMM1[71:64]
//                             // if XMM2[79:72] > XMM1[79:72] then
//                                 XMM2[79:72] -> XMM1[79:72]
//                             // if XMM2[87:80] > XMM1[87:80] then
//                                 XMM2[87:80] -> XMM1[87:80]
//                             // if XMM2[95:88] > XMM1[95:88] then
//                                 XMM2[95:88] -> XMM1[95:88]
//                             // if XMM2[103:96] > XMM1[103:96] then
//                                 XMM2[103:96] -> XMM1[103:96]
//                             // if XMM2[111:104] > XMM1[111:104] then
//                                 XMM2[111:104] -> XMM1[111:104]
//                             // if XMM2[119:112] > XMM1[119:112] then
//                                 XMM2[119:112] -> XMM1[119:112]
//                             // if XMM2[127:120] > XMM1[127:120] then
//                                 XMM2[127:120] -> XMM1[127:120]
//
//  XMM2 <-  XMM1  PMAXSB,     // if XMM1[7:0] > XMM2[7:0] then
//                                 XMM1[7:0] -> XMM2[7:0]
//                             // if XMM1[15:8] > XMM2[15:8] then
//                                 XMM1[15:8] -> XMM2[15:8]
//                             // if XMM1[23:16] > XMM2[23:16] then
//                                 XMM1[23:16] -> XMM2[23:16]
//                             // if XMM1[31:24] > XMM2[31:24] then
//                                 XMM1[31:24] -> XMM2[31:24]
//                             // if XMM1[39:32] > XMM2[39:32] then
//                                 XMM1[39:32] -> XMM2[39:32]
//                             // if XMM1[47:40] > XMM2[47:40] then
//                                 XMM1[47:40] -> XMM2[47:40]
//                             // if XMM1[55:48] > XMM2[55:48] then
//                                 XMM1[55:48] -> XMM2[55:48]
//                             // if XMM1[63:56] > XMM2[63:56] then
//                                 XMM1[63:56] -> XMM2[63:56]
//                             // if XMM1[71:64] > XMM2[71:64] then
//                                 XMM1[71:64] -> XMM2[71:64]
//                             // if XMM1[79:72] > XMM2[79:72] then
//                                 XMM1[79:72] -> XMM2[79:72]
//                             // if XMM1[87:80] > XMM2[87:80] then
//                                 XMM1[87:80] -> XMM2[87:80]
//                             // if XMM1[95:88] > XMM2[95:88] then
//                                 XMM1[95:88] -> XMM2[95:88]
//                             // if XMM1[103:96] > XMM2[103:96] then
//                                 XMM1[103:96] -> XMM2[103:96]
//                             // if XMM1[111:104] > XMM2[111:104] then
//                                 XMM1[111:104] -> XMM2[111:104]
//                             // if XMM1[119:112] > XMM2[119:112] then
//                                 XMM1[119:112] -> XMM2[119:112]
//                             // if XMM1[127:120] > XMM2[127:120] then
//                                 XMM1[127:120] -> XMM2[127:120]
//
//  XMM1  XMM8  PMAXSB,        // if XMM1[7:0] > XMM8[7:0] then
//                                 XMM1[7:0] -> XMM8[7:0]
//                             // if XMM1[15:8] > XMM8[15:8] then
//                                 XMM1[15:8] -> XMM8[15:8]
//                             // if XMM1[23:16] > XMM8[23:16] then
//                                 XMM1[23:16] -> XMM8[23:16]
//                             // if XMM1[31:24] > XMM8[31:24] then
//                                 XMM1[31:24] -> XMM8[31:24]
//                             // if XMM1[39:32] > XMM8[39:32] then
//                                 XMM1[39:32] -> XMM8[39:32]
//                             // if XMM1[47:40] > XMM8[47:40] then
//                                 XMM1[47:40] -> XMM8[47:40]
//                             // if XMM1[55:48] > XMM8[55:48] then
//                                 XMM1[55:48] -> XMM8[55:48]
//                             // if XMM1[63:56] > XMM8[63:56] then
//                                 XMM1[63:56] -> XMM8[63:56]
//                             // if XMM1[71:64] > XMM8[71:64] then
//                                 XMM1[71:64] -> XMM8[71:64]
//                             // if XMM1[79:72] > XMM8[79:72] then
//                                 XMM1[79:72] -> XMM8[79:72]
//                             // if XMM1[87:80] > XMM8[87:80] then
//                                 XMM1[87:80] -> XMM8[87:80]
//                             // if XMM1[95:88] > XMM8[95:88] then
//                                 XMM1[95:88] -> XMM8[95:88]
//                             // if XMM1[103:96] > XMM8[103:96] then
//                                 XMM1[103:96] -> XMM8[103:96]
//                             // if XMM1[111:104] > XMM8[111:104] then
//                                 XMM1[111:104] -> XMM8[111:104]
//                             // if XMM1[119:112] > XMM8[119:112] then
//                                 XMM1[119:112] -> XMM8[119:112]
//                             // if XMM1[127:120] > XMM8[127:120] then
//                                 XMM1[127:120] -> XMM8[127:120]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpmaxsdcomma ( PMAXSD, )
//
// C prototype:
//  void dg_forthpmaxsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either target's
//                                 addressing mode parameters and can not
//                                 come in the middle of them.
//   <-                           sets the direction to reverse.
//                                 This is pushed after either target's
//                                 addressing mode parameters and can not
//                                 come in the middle of them.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMAXSD instruction. This sequence compares each signed 32 bit value
//   of the source with each signed 32 bit value in the destination and if the
//   source value is greater, it is copied to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMAXSD,     // if [RBX][31:0] > XMM1[31:0] then
//                                 [RBX][31:0] -> XMM1[31:0]
//                             // if [RBX][63:32] > XMM1[63:32] then
//                                 [RBX][63:32] -> XMM1[63:32]
//                             // if [RBX][95:64] > XMM1[95:64] then
//                                 [RBX][95:64] -> XMM1[95:64]
//                             // if [RBX][127:96] > XMM1[127:96] then
//                                 [RBX][127:96] -> XMM1[127:96]
//
//  XMM2  XMM1  PMAXSD,        // if XMM2[31:0] > XMM1[31:0] then
//                                 XMM2[31:0] -> XMM1[31:0]
//                             // if XMM2[63:32] > XMM1[63:32] then
//                                 XMM2[63:32] -> XMM1[63:32]
//                             // if XMM2[95:64] > XMM1[95:64] then
//                                 XMM2[95:64] -> XMM1[95:64]
//                             // if XMM2[127:96] > XMM1[127:96] then
//                                 XMM2[127:96] -> XMM1[127:96]
//
//  XMM2 <-  XMM1  PMAXSD,     // if XMM1[31:0] > XMM2[31:0] then
//                                 XMM1[31:0] -> XMM2[31:0]
//                             // if XMM1[63:32] > XMM2[63:32] then
//                                 XMM1[63:32] -> XMM2[63:32]
//                             // if XMM1[95:64] > XMM2[95:64] then
//                                 XMM1[95:64] -> XMM2[95:64]
//                             // if XMM1[127:96] > XMM2[127:96] then
//                                 XMM1[127:96] -> XMM2[127:96]
//
//  XMM1  XMM8  PMAXSD,        // if XMM1[31:0] > XMM8[31:0] then
//                                 XMM1[31:0] -> XMM8[31:0]
//                             // if XMM1[63:32] > XMM8[63:32] then
//                                 XMM1[63:32] -> XMM8[63:32]
//                             // if XMM1[95:64] > XMM8[95:64] then
//                                 XMM1[95:64] -> XMM8[95:64]
//                             // if XMM1[127:96] > XMM8[127:96] then
//                                 XMM1[127:96] -> XMM8[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, so
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpmaxswcomma ( PMAXSW, )
//
// C prototype:
//  void dg_forthpmaxswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either target's
//                                 addressing mode parameters and can not
//                                 come in the middle of them.
//   <-                           sets the direction to reverse.
//                                 This is pushed after either target's
//                                 addressing mode parameters and can not
//                                 come in the middle of them.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMAXSW instruction. This sequence compares each signed 16 bit integer
//   of the source with the corresponding signed 16 bit integer in the
//   destination and if the source value is greater, it is copied to the
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMAXSW,     // if [RBX][15:0] > XMM1[15:0] then
//                                 [RBX][15:0] -> XMM1[15:0]
//                             // if [RBX][31:16] > XMM1[31:16] then
//                                 [RBX][31:16] -> XMM1[31:16]
//                             // if [RBX][47:32] > XMM1[47:32] then
//                                 [RBX][47:32] -> XMM1[47:32]
//                             // if [RBX][63:48] > XMM1[63:48] then
//                                 [RBX][63:48] -> XMM1[63:48]
//                             // if [RBX][79:64] > XMM1[79:64] then
//                                 [RBX][79:64] -> XMM1[79:64]
//                             // if [RBX][95:80] > XMM1[95:80] then
//                                 [RBX][95:80] -> XMM1[95:80]
//                             // if [RBX][111:96] > XMM1[111:96] then
//                                 [RBX][111:96] -> XMM1[111:96]
//                             // if [RBX][127:112] > XMM1[127:112] then
//                                 [RBX][127:112] -> XMM1[127:112]
//
//  RBX [R]  ST1  PMAXSW,      // if [RBX][15:0] > ST1[15:0] then
//                                 [RBX][15:0] -> ST1[15:0]
//                             // if [RBX][31:16] > ST1[31:16] then
//                                 [RBX][31:16] -> ST1[31:16]
//                             // if [RBX][47:32] > ST1[47:32] then
//                                 [RBX][47:32] -> ST1[47:32]
//                             // if [RBX][63:48] > ST1[63:48] then
//                                 [RBX][63:48] -> ST1[63:48]
//
//  XMM2  XMM1  PMAXSW,        // if XMM2[15:0] > XMM1[15:0] then
//                                 XMM2[15:0] -> XMM1[15:0]
//                             // if XMM2[31:16] > XMM1[31:16] then
//                                 XMM2[31:16] -> XMM1[31:16]
//                             // if XMM2[47:32] > XMM1[47:32] then
//                                 XMM2[47:32] -> XMM1[47:32]
//                             // if XMM2[63:48] > XMM1[63:48] then
//                                 XMM2[63:48] -> XMM1[63:48]
//                             // if XMM2[79:64] > XMM1[79:64] then
//                                 XMM2[79:64] -> XMM1[79:64]
//                             // if XMM2[95:80] > XMM1[95:80] then
//                                 XMM2[95:80] -> XMM1[95:80]
//                             // if XMM2[111:96] > XMM1[111:96] then
//                                 XMM2[111:96] -> XMM1[111:96]
//                             // if XMM2[127:112] > XMM1[127:112] then
//                                 XMM2[127:112] -> XMM1[127:112]
//
//  ST2  ST1  PMAXSW,          // if ST2[15:0] > ST1[15:0] then
//                                 ST2[15:0] -> ST1[15:0]
//                             // if ST2[31:16] > ST1[31:16] then
//                                 ST2[31:16] -> ST1[31:16]
//                             // if ST2[47:32] > ST1[47:32] then
//                                 ST2[47:32] -> ST1[47:32]
//                             // if ST2[63:48] > ST1[63:48] then
//                                 ST2[63:48] -> ST1[63:48]
//
//  XMM2 <-  XMM1  PMAXSW,     // if XMM1[15:0] > XMM2[15:0] then
//                                 XMM1[15:0] -> XMM2[15:0]
//                             // if XMM1[31:16] > XMM2[31:16] then
//                                 XMM1[31:16] -> XMM2[31:16]
//                             // if XMM1[47:32] > XMM2[47:32] then
//                                 XMM1[47:32] -> XMM2[47:32]
//                             // if XMM1[63:48] > XMM2[63:48] then
//                                 XMM1[63:48] -> XMM2[63:48]
//                             // if XMM1[79:64] > XMM2[79:64] then
//                                 XMM1[79:64] -> XMM2[79:64]
//                             // if XMM1[95:80] > XMM2[95:80] then
//                                 XMM1[95:80] -> XMM2[95:80]
//                             // if XMM1[111:96] > XMM2[111:96] then
//                                 XMM1[111:96] -> XMM2[111:96]
//                             // if XMM1[127:112] > XMM2[127:112] then
//                                 XMM1[127:112] -> XMM2[127:112]
//
//  XMM1  XMM8  PMAXSW,        // if XMM1[15:0] > XMM8[15:0] then
//                                 XMM1[15:0] -> XMM8[15:0]
//                             // if XMM1[31:16] > XMM8[31:16] then
//                                 XMM1[31:16] -> XMM8[31:16]
//                             // if XMM1[47:32] > XMM8[47:32] then
//                                 XMM1[47:32] -> XMM8[47:32]
//                             // if XMM1[63:48] > XMM8[63:48] then
//                                 XMM1[63:48] -> XMM8[63:48]
//                             // if XMM1[79:64] > XMM8[79:64] then
//                                 XMM1[79:64] -> XMM8[79:64]
//                             // if XMM1[95:80] > XMM8[95:80] then
//                                 XMM1[95:80] -> XMM8[95:80]
//                             // if XMM1[111:96] > XMM8[111:96] then
//                                 XMM1[111:96] -> XMM8[111:96]
//                             // if XMM1[127:112] > XMM8[127:112] then
//                                 XMM1[127:112] -> XMM8[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, so
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpmaxubcomma ( PMAXUB, )
//
// C prototype:
//  void dg_forthpmaxubcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either target's
//                                 addressing mode parameters and can not
//                                 come in the middle of them.
//   <-                           sets the direction to reverse.
//                                 This is pushed after either target's
//                                 addressing mode parameters and can not
//                                 come in the middle of them.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMAXUB instruction. This sequence compares each unsigned byte of the
//   source with each unsigned byte in the destination and if the source value
//   is greater, it is copied to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMAXUB,     // if [RBX][7:0] > XMM1[7:0] then
//                                 [RBX][7:0] -> XMM1[7:0]
//                             // if [RBX][15:8] > XMM1[15:8] then
//                                 [RBX][15:8] -> XMM1[15:8]
//                             // if [RBX][23:16] > XMM1[23:16] then
//                                 [RBX][23:16] -> XMM1[23:16]
//                             // if [RBX][31:24] > XMM1[31:24] then
//                                 [RBX][31:24] -> XMM1[31:24]
//                             // if [RBX][39:32] > XMM1[39:32] then
//                                 [RBX][39:32] -> XMM1[39:32]
//                             // if [RBX][47:40] > XMM1[47:40] then
//                                 [RBX][47:40] -> XMM1[47:40]
//                             // if [RBX][55:48] > XMM1[55:48] then
//                                 [RBX][55:48] -> XMM1[55:48]
//                             // if [RBX][63:56] > XMM1[63:56] then
//                                 [RBX][63:56] -> XMM1[63:56]
//                             // if [RBX][71:64] > XMM1[71:64] then
//                                 [RBX][71:64] -> XMM1[71:64]
//                             // if [RBX][79:72] > XMM1[79:72] then
//                                 [RBX][79:72] -> XMM1[79:72]
//                             // if [RBX][87:80] > XMM1[87:80] then
//                                 [RBX][87:80] -> XMM1[87:80]
//                             // if [RBX][95:88] > XMM1[95:88] then
//                                 [RBX][95:88] -> XMM1[95:88]
//                             // if [RBX][103:96] > XMM1[103:96] then
//                                 [RBX][103:96] -> XMM1[103:96]
//                             // if [RBX][111:104] > XMM1[111:104] then
//                                 [RBX][111:104] -> XMM1[111:104]
//                             // if [RBX][119:112] > XMM1[119:112] then
//                                 [RBX][119:112] -> XMM1[119:112]
//                             // if [RBX][127:120] > XMM1[127:120] then
//                                 [RBX][127:120] -> XMM1[127:120]
//
//  RBX [R]  ST1  PMAXUB,      // if [RBX][7:0] > ST1[7:0] then
//                                 [RBX][7:0] -> ST1[7:0]
//                             // if [RBX][15:8] > ST1[15:8] then
//                                 [RBX][15:8] -> ST1[15:8]
//                             // if [RBX][23:16] > ST1[23:16] then
//                                 [RBX][23:16] -> ST1[23:16]
//                             // if [RBX][31:24] > ST1[31:24] then
//                                 [RBX][31:24] -> ST1[31:24]
//                             // if [RBX][39:32] > ST1[39:32] then
//                                 [RBX][39:32] -> ST1[39:32]
//                             // if [RBX][47:40] > ST1[47:40] then
//                                 [RBX][47:40] -> ST1[47:40]
//                             // if [RBX][55:48] > ST1[55:48] then
//                                 [RBX][55:48] -> ST1[55:48]
//                             // if [RBX][63:56] > ST1[63:56] then
//                                 [RBX][63:56] -> ST1[63:56]
//
//  XMM2  XMM1  PMAXUB,        // if XMM2[7:0] > XMM1[7:0] then
//                                 XMM2[7:0] -> XMM1[7:0]
//                             // if XMM2[15:8] > XMM1[15:8] then
//                                 XMM2[15:8] -> XMM1[15:8]
//                             // if XMM2[23:16] > XMM1[23:16] then
//                                 XMM2[23:16] -> XMM1[23:16]
//                             // if XMM2[31:24] > XMM1[31:24] then
//                                 XMM2[31:24] -> XMM1[31:24]
//                             // if XMM2[39:32] > XMM1[39:32] then
//                                 XMM2[39:32] -> XMM1[39:32]
//                             // if XMM2[47:40] > XMM1[47:40] then
//                                 XMM2[47:40] -> XMM1[47:40]
//                             // if XMM2[55:48] > XMM1[55:48] then
//                                 XMM2[55:48] -> XMM1[55:48]
//                             // if XMM2[63:56] > XMM1[63:56] then
//                                 XMM2[63:56] -> XMM1[63:56]
//                             // if XMM2[71:64] > XMM1[71:64] then
//                                 XMM2[71:64] -> XMM1[71:64]
//                             // if XMM2[79:72] > XMM1[79:72] then
//                                 XMM2[79:72] -> XMM1[79:72]
//                             // if XMM2[87:80] > XMM1[87:80] then
//                                 XMM2[87:80] -> XMM1[87:80]
//                             // if XMM2[95:88] > XMM1[95:88] then
//                                 XMM2[95:88] -> XMM1[95:88]
//                             // if XMM2[103:96] > XMM1[103:96] then
//                                 XMM2[103:96] -> XMM1[103:96]
//                             // if XMM2[111:104] > XMM1[111:104] then
//                                 XMM2[111:104] -> XMM1[111:104]
//                             // if XMM2[119:112] > XMM1[119:112] then
//                                 XMM2[119:112] -> XMM1[119:112]
//                             // if XMM2[127:120] > XMM1[127:120] then
//                                 XMM2[127:120] -> XMM1[127:120]
//
//  ST2  ST1  PMAXUB,          // if ST2[7:0] > ST1[7:0] then
//                                 ST2[7:0] -> ST1[7:0]
//                             // if ST2[15:8] > ST1[15:8] then
//                                 ST2[15:8] -> ST1[15:8]
//                             // if ST2[23:16] > ST1[23:16] then
//                                 ST2[23:16] -> ST1[23:16]
//                             // if ST2[31:24] > ST1[31:24] then
//                                 ST2[31:24] -> ST1[31:24]
//                             // if ST2[39:32] > ST1[39:32] then
//                                 ST2[39:32] -> ST1[39:32]
//                             // if ST2[47:40] > ST1[47:40] then
//                                 ST2[47:40] -> ST1[47:40]
//                             // if ST2[55:48] > ST1[55:48] then
//                                 ST2[55:48] -> ST1[55:48]
//                             // if ST2[63:56] > ST1[63:56] then
//                                 ST2[63:56] -> ST1[63:56]
//
//  XMM2 <-  XMM1  PMAXUB,     // if XMM1[7:0] > XMM2[7:0] then
//                                 XMM1[7:0] -> XMM2[7:0]
//                             // if XMM1[15:8] > XMM2[15:8] then
//                                 XMM1[15:8] -> XMM2[15:8]
//                             // if XMM1[23:16] > XMM2[23:16] then
//                                 XMM1[23:16] -> XMM2[23:16]
//                             // if XMM1[31:24] > XMM2[31:24] then
//                                 XMM1[31:24] -> XMM2[31:24]
//                             // if XMM1[39:32] > XMM2[39:32] then
//                                 XMM1[39:32] -> XMM2[39:32]
//                             // if XMM1[47:40] > XMM2[47:40] then
//                                 XMM1[47:40] -> XMM2[47:40]
//                             // if XMM1[55:48] > XMM2[55:48] then
//                                 XMM1[55:48] -> XMM2[55:48]
//                             // if XMM1[63:56] > XMM2[63:56] then
//                                 XMM1[63:56] -> XMM2[63:56]
//                             // if XMM1[71:64] > XMM2[71:64] then
//                                 XMM1[71:64] -> XMM2[71:64]
//                             // if XMM1[79:72] > XMM2[79:72] then
//                                 XMM1[79:72] -> XMM2[79:72]
//                             // if XMM1[87:80] > XMM2[87:80] then
//                                 XMM1[87:80] -> XMM2[87:80]
//                             // if XMM1[95:88] > XMM2[95:88] then
//                                 XMM1[95:88] -> XMM2[95:88]
//                             // if XMM1[103:96] > XMM2[103:96] then
//                                 XMM1[103:96] -> XMM2[103:96]
//                             // if XMM1[111:104] > XMM2[111:104] then
//                                 XMM1[111:104] -> XMM2[111:104]
//                             // if XMM1[119:112] > XMM2[119:112] then
//                                 XMM1[119:112] -> XMM2[119:112]
//                             // if XMM1[127:120] > XMM2[127:120] then
//                                 XMM1[127:120] -> XMM2[127:120]
//
//  XMM1  XMM8  PMAXUB,        // if XMM1[7:0] > XMM8[7:0] then
//                                 XMM1[7:0] -> XMM8[7:0]
//                             // if XMM1[15:8] > XMM8[15:8] then
//                                 XMM1[15:8] -> XMM8[15:8]
//                             // if XMM1[23:16] > XMM8[23:16] then
//                                 XMM1[23:16] -> XMM8[23:16]
//                             // if XMM1[31:24] > XMM8[31:24] then
//                                 XMM1[31:24] -> XMM8[31:24]
//                             // if XMM1[39:32] > XMM8[39:32] then
//                                 XMM1[39:32] -> XMM8[39:32]
//                             // if XMM1[47:40] > XMM8[47:40] then
//                                 XMM1[47:40] -> XMM8[47:40]
//                             // if XMM1[55:48] > XMM8[55:48] then
//                                 XMM1[55:48] -> XMM8[55:48]
//                             // if XMM1[63:56] > XMM8[63:56] then
//                                 XMM1[63:56] -> XMM8[63:56]
//                             // if XMM1[71:64] > XMM8[71:64] then
//                                 XMM1[71:64] -> XMM8[71:64]
//                             // if XMM1[79:72] > XMM8[79:72] then
//                                 XMM1[79:72] -> XMM8[79:72]
//                             // if XMM1[87:80] > XMM8[87:80] then
//                                 XMM1[87:80] -> XMM8[87:80]
//                             // if XMM1[95:88] > XMM8[95:88] then
//                                 XMM1[95:88] -> XMM8[95:88]
//                             // if XMM1[103:96] > XMM8[103:96] then
//                                 XMM1[103:96] -> XMM8[103:96]
//                             // if XMM1[111:104] > XMM8[111:104] then
//                                 XMM1[111:104] -> XMM8[111:104]
//                             // if XMM1[119:112] > XMM8[119:112] then
//                                 XMM1[119:112] -> XMM8[119:112]
//                             // if XMM1[127:120] > XMM8[127:120] then
//                                 XMM1[127:120] -> XMM8[127:120]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpmaxudcomma ( PMAXUD, )
//
// C prototype:
//  void dg_forthpmaxudcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after the addressing mode
//                                 parameters of either target and cannot come
//                                 in the middle of a target's addressing mode
//                                 parameters.
//   <-                           sets the direction to reverse.
//                                 This is pushed after the addressing mode
//                                 parameters of either target and cannot come
//                                 in the middle of a target's addressing mode
//                                 parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMAXUD instruction. This sequence compares each unsigned 32 bit value
//   of the source with each unsigned 32 bit value in the destination and if the
//   source value is greater, it is copied to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMAXUD,     // if [RBX][31:0] > XMM1[31:0] then
//                                 [RBX][31:0] -> XMM1[31:0]
//                             // if [RBX][63:32] > XMM1[63:32] then
//                                 [RBX][63:32] -> XMM1[63:32]
//                             // if [RBX][95:64] > XMM1[95:64] then
//                                 [RBX][95:64] -> XMM1[95:64]
//                             // if [RBX][127:96] > XMM1[127:96] then
//                                 [RBX][127:96] -> XMM1[127:96]
//
//  XMM2  XMM1  PMAXUD,        // if XMM2[31:0] > XMM1[31:0] then
//                                 XMM2[31:0] -> XMM1[31:0]
//                             // if XMM2[63:32] > XMM1[63:32] then
//                                 XMM2[63:32] -> XMM1[63:32]
//                             // if XMM2[95:64] > XMM1[95:64] then
//                                 XMM2[95:64] -> XMM1[95:64]
//                             // if XMM2[127:96] > XMM1[127:96] then
//                                 XMM2[127:96] -> XMM1[127:96]
//
//  XMM2 <-  XMM1  PMAXUD,  // if XMM1[31:0] > XMM2[31:0] then
//                                 XMM1[31:0] -> XMM2[31:0]
//                             // if XMM1[63:32] > XMM2[63:32] then
//                                 XMM1[63:32] -> XMM2[63:32]
//                             // if XMM1[95:64] > XMM2[95:64] then
//                                 XMM1[95:64] -> XMM2[95:64]
//                             // if XMM1[127:96] > XMM2[127:96] then
//                                 XMM1[127:96] -> XMM2[127:96]
//
//  XMM1  XMM8  PMAXUD,        // if XMM1[31:0] > XMM8[31:0] then
//                                 XMM1[31:0] -> XMM8[31:0]
//                             // if XMM1[63:32] > XMM8[63:32] then
//                                 XMM1[63:32] -> XMM8[63:32]
//                             // if XMM1[95:64] > XMM8[95:64] then
//                                 XMM1[95:64] -> XMM8[95:64]
//                             // if XMM1[127:96] > XMM8[127:96] then
//                                 XMM1[127:96] -> XMM8[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpmaxuwcomma ( PMAXUW, )
//
// C prototype:
//  void dg_forthpmaxuwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after the addressing mode
//                                 parameters of either target and cannot come
//                                 in the middle of a target's addressing mode
//                                 parameters.
//   <-                           sets the direction to reverse.
//                                 This is pushed after the addressing mode
//                                 parameters of either target and cannot come
//                                 in the middle of a target's addressing mode
//                                 parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMAXUW instruction. This sequence compares each unsigned 16 bit integer
//   of the source with each unsigned 16 bit integer in the destination and if the
//   source value is greater, it is copied to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMAXUW,     // if [RBX][15:0] > XMM1[15:0] then
//                                 [RBX][15:0] -> XMM1[15:0]
//                             // if [RBX][31:16] > XMM1[31:16] then
//                                 [RBX][31:16] -> XMM1[31:16]
//                             // if [RBX][47:32] > XMM1[47:32] then
//                                 [RBX][47:32] -> XMM1[47:32]
//                             // if [RBX][63:48] > XMM1[63:48] then
//                                 [RBX][63:48] -> XMM1[63:48]
//                             // if [RBX][79:64] > XMM1[79:64] then
//                                 [RBX][79:64] -> XMM1[79:64]
//                             // if [RBX][95:80] > XMM1[95:80] then
//                                 [RBX][95:80] -> XMM1[95:80]
//                             // if [RBX][111:96] > XMM1[111:96] then
//                                 [RBX][111:96] -> XMM1[111:96]
//                             // if [RBX][127:112] > XMM1[127:112] then
//                                 [RBX][127:112] -> XMM1[127:112]
//
//  XMM2  XMM1  PMAXUW,        // if XMM2[15:0] > XMM1[15:0] then
//                                 XMM2[15:0] -> XMM1[15:0]
//                             // if XMM2[31:16] > XMM1[31:16] then
//                                 XMM2[31:16] -> XMM1[31:16]
//                             // if XMM2[47:32] > XMM1[47:32] then
//                                 XMM2[47:32] -> XMM1[47:32]
//                             // if XMM2[63:48] > XMM1[63:48] then
//                                 XMM2[63:48] -> XMM1[63:48]
//                             // if XMM2[79:64] > XMM1[79:64] then
//                                 XMM2[79:64] -> XMM1[79:64]
//                             // if XMM2[95:80] > XMM1[95:80] then
//                                 XMM2[95:80] -> XMM1[95:80]
//                             // if XMM2[111:96] > XMM1[111:96] then
//                                 XMM2[111:96] -> XMM1[111:96]
//                             // if XMM2[127:112] > XMM1[127:112] then
//                                 XMM2[127:112] -> XMM1[127:112]
//
//  XMM2 <-  XMM1  PMAXUW,  // if XMM1[15:0] > XMM2[15:0] then
//                                 XMM1[15:0] -> XMM2[15:0]
//                             // if XMM1[31:16] > XMM2[31:16] then
//                                 XMM1[31:16] -> XMM2[31:16]
//                             // if XMM1[47:32] > XMM2[47:32] then
//                                 XMM1[47:32] -> XMM2[47:32]
//                             // if XMM1[63:48] > XMM2[63:48] then
//                                 XMM1[63:48] -> XMM2[63:48]
//                             // if XMM1[79:64] > XMM2[79:64] then
//                                 XMM1[79:64] -> XMM2[79:64]
//                             // if XMM1[95:80] > XMM2[95:80] then
//                                 XMM1[95:80] -> XMM2[95:80]
//                             // if XMM1[111:96] > XMM2[111:96] then
//                                 XMM1[111:96] -> XMM2[111:96]
//                             // if XMM1[127:112] > XMM2[127:112] then
//                                 XMM1[127:112] -> XMM2[127:112]
//
//  XMM1  XMM8  PMAXUW,        // if XMM1[15:0] > XMM8[15:0] then
//                                 XMM1[15:0] -> XMM8[15:0]
//                             // if XMM1[31:16] > XMM8[31:16] then
//                                 XMM1[31:16] -> XMM8[31:16]
//                             // if XMM1[47:32] > XMM8[47:32] then
//                                 XMM1[47:32] -> XMM8[47:32]
//                             // if XMM1[63:48] > XMM8[63:48] then
//                                 XMM1[63:48] -> XMM8[63:48]
//                             // if XMM1[79:64] > XMM8[79:64] then
//                                 XMM1[79:64] -> XMM8[79:64]
//                             // if XMM1[95:80] > XMM8[95:80] then
//                                 XMM1[95:80] -> XMM8[95:80]
//                             // if XMM1[111:96] > XMM8[111:96] then
//                                 XMM1[111:96] -> XMM8[111:96]
//                             // if XMM1[127:112] > XMM8[127:112] then
//                                 XMM1[127:112] -> XMM8[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. If the source is not memory, then it must also be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpminsbcomma ( PMINSB, )
//
// C prototype:
//  void dg_forthpminsbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after the addressing mode
//                                 parameters of either target and cannot come
//                                 in the middle of a target's addressing mode
//                                 parameters.
//   <-                           sets the direction to reverse.
//                                 This is pushed after the addressing mode
//                                 parameters of either target and cannot come
//                                 in the middle of a target's addressing mode
//                                 parameters.
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMINSB instruction. This sequence compares each signed byte of the
//   source with each signed byte in the destination and if the source value
//   is less, it is copied to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMINSB,     // if [RBX][7:0] < XMM1[7:0] then
//                                 [RBX][7:0] -> XMM1[7:0]
//                             // if [RBX][15:8] < XMM1[15:8] then
//                                 [RBX][15:8] -> XMM1[15:8]
//                             // if [RBX][23:16] < XMM1[23:16] then
//                                 [RBX][23:16] -> XMM1[23:16]
//                             // if [RBX][31:24] < XMM1[31:24] then
//                                 [RBX][31:24] -> XMM1[31:24]
//                             // if [RBX][39:32] < XMM1[39:32] then
//                                 [RBX][39:32] -> XMM1[39:32]
//                             // if [RBX][47:40] < XMM1[47:40] then
//                                 [RBX][47:40] -> XMM1[47:40]
//                             // if [RBX][55:48] < XMM1[55:48] then
//                                 [RBX][55:48] -> XMM1[55:48]
//                             // if [RBX][63:56] < XMM1[63:56] then
//                                 [RBX][63:56] -> XMM1[63:56]
//                             // if [RBX][71:64] < XMM1[71:64] then
//                                 [RBX][71:64] -> XMM1[71:64]
//                             // if [RBX][79:72] < XMM1[79:72] then
//                                 [RBX][79:72] -> XMM1[79:72]
//                             // if [RBX][87:80] < XMM1[87:80] then
//                                 [RBX][87:80] -> XMM1[87:80]
//                             // if [RBX][95:88] < XMM1[95:88] then
//                                 [RBX][95:88] -> XMM1[95:88]
//                             // if [RBX][103:96] < XMM1[103:96] then
//                                 [RBX][103:96] -> XMM1[103:96]
//                             // if [RBX][111:104] < XMM1[111:104] then
//                                 [RBX][111:104] -> XMM1[111:104]
//                             // if [RBX][119:112] < XMM1[119:112] then
//                                 [RBX][119:112] -> XMM1[119:112]
//                             // if [RBX][127:120] < XMM1[127:120] then
//                                 [RBX][127:120] -> XMM1[127:120]
//
//  XMM2  XMM1  PMINSB,        // if XMM2[7:0] < XMM1[7:0] then
//                                 XMM2[7:0] -> XMM1[7:0]
//                             // if XMM2[15:8] < XMM1[15:8] then
//                                 XMM2[15:8] -> XMM1[15:8]
//                             // if XMM2[23:16] < XMM1[23:16] then
//                                 XMM2[23:16] -> XMM1[23:16]
//                             // if XMM2[31:24] < XMM1[31:24] then
//                                 XMM2[31:24] -> XMM1[31:24]
//                             // if XMM2[39:32] < XMM1[39:32] then
//                                 XMM2[39:32] -> XMM1[39:32]
//                             // if XMM2[47:40] < XMM1[47:40] then
//                                 XMM2[47:40] -> XMM1[47:40]
//                             // if XMM2[55:48] < XMM1[55:48] then
//                                 XMM2[55:48] -> XMM1[55:48]
//                             // if XMM2[63:56] < XMM1[63:56] then
//                                 XMM2[63:56] -> XMM1[63:56]
//                             // if XMM2[71:64] < XMM1[71:64] then
//                                 XMM2[71:64] -> XMM1[71:64]
//                             // if XMM2[79:72] < XMM1[79:72] then
//                                 XMM2[79:72] -> XMM1[79:72]
//                             // if XMM2[87:80] < XMM1[87:80] then
//                                 XMM2[87:80] -> XMM1[87:80]
//                             // if XMM2[95:88] < XMM1[95:88] then
//                                 XMM2[95:88] -> XMM1[95:88]
//                             // if XMM2[103:96] < XMM1[103:96] then
//                                 XMM2[103:96] -> XMM1[103:96]
//                             // if XMM2[111:104] < XMM1[111:104] then
//                                 XMM2[111:104] -> XMM1[111:104]
//                             // if XMM2[119:112] < XMM1[119:112] then
//                                 XMM2[119:112] -> XMM1[119:112]
//                             // if XMM2[127:120] < XMM1[127:120] then
//                                 XMM2[127:120] -> XMM1[127:120]
//
//  XMM2 <-  XMM1  PMINSB,  // if XMM1[7:0] < XMM2[7:0] then
//                                 XMM1[7:0] -> XMM2[7:0]
//                             // if XMM1[15:8] < XMM2[15:8] then
//                                 XMM1[15:8] -> XMM2[15:8]
//                             // if XMM1[23:16] < XMM2[23:16] then
//                                 XMM1[23:16] -> XMM2[23:16]
//                             // if XMM1[31:24] < XMM2[31:24] then
//                                 XMM1[31:24] -> XMM2[31:24]
//                             // if XMM1[39:32] < XMM2[39:32] then
//                                 XMM1[39:32] -> XMM2[39:32]
//                             // if XMM1[47:40] < XMM2[47:40] then
//                                 XMM1[47:40] -> XMM2[47:40]
//                             // if XMM1[55:48] < XMM2[55:48] then
//                                 XMM1[55:48] -> XMM2[55:48]
//                             // if XMM1[63:56] < XMM2[63:56] then
//                                 XMM1[63:56] -> XMM2[63:56]
//                             // if XMM1[71:64] < XMM2[71:64] then
//                                 XMM1[71:64] -> XMM2[71:64]
//                             // if XMM1[79:72] < XMM2[79:72] then
//                                 XMM1[79:72] -> XMM2[79:72]
//                             // if XMM1[87:80] < XMM2[87:80] then
//                                 XMM1[87:80] -> XMM2[87:80]
//                             // if XMM1[95:88] < XMM2[95:88] then
//                                 XMM1[95:88] -> XMM2[95:88]
//                             // if XMM1[103:96] < XMM2[103:96] then
//                                 XMM1[103:96] -> XMM2[103:96]
//                             // if XMM1[111:104] < XMM2[111:104] then
//                                 XMM1[111:104] -> XMM2[111:104]
//                             // if XMM1[119:112] < XMM2[119:112] then
//                                 XMM1[119:112] -> XMM2[119:112]
//                             // if XMM1[127:120] < XMM2[127:120] then
//                                 XMM1[127:120] -> XMM2[127:120]
//
//  XMM1  XMM8  PMINSB,        // if XMM1[7:0] < XMM8[7:0] then
//                                 XMM1[7:0] -> XMM8[7:0]
//                             // if XMM1[15:8] < XMM8[15:8] then
//                                 XMM1[15:8] -> XMM8[15:8]
//                             // if XMM1[23:16] < XMM8[23:16] then
//                                 XMM1[23:16] -> XMM8[23:16]
//                             // if XMM1[31:24] < XMM8[31:24] then
//                                 XMM1[31:24] -> XMM8[31:24]
//                             // if XMM1[39:32] < XMM8[39:32] then
//                                 XMM1[39:32] -> XMM8[39:32]
//                             // if XMM1[47:40] < XMM8[47:40] then
//                                 XMM1[47:40] -> XMM8[47:40]
//                             // if XMM1[55:48] < XMM8[55:48] then
//                                 XMM1[55:48] -> XMM8[55:48]
//                             // if XMM1[63:56] < XMM8[63:56] then
//                                 XMM1[63:56] -> XMM8[63:56]
//                             // if XMM1[71:64] < XMM8[71:64] then
//                                 XMM1[71:64] -> XMM8[71:64]
//                             // if XMM1[79:72] < XMM8[79:72] then
//                                 XMM1[79:72] -> XMM8[79:72]
//                             // if XMM1[87:80] < XMM8[87:80] then
//                                 XMM1[87:80] -> XMM8[87:80]
//                             // if XMM1[95:88] < XMM8[95:88] then
//                                 XMM1[95:88] -> XMM8[95:88]
//                             // if XMM1[103:96] < XMM8[103:96] then
//                                 XMM1[103:96] -> XMM8[103:96]
//                             // if XMM1[111:104] < XMM8[111:104] then
//                                 XMM1[111:104] -> XMM8[111:104]
//                             // if XMM1[119:112] < XMM8[119:112] then
//                                 XMM1[119:112] -> XMM8[119:112]
//                             // if XMM1[127:120] < XMM8[127:120] then
//                                 XMM1[127:120] -> XMM8[127:120]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpminsdcomma ( PMINSD, )
//
// C prototype:
//  void dg_forthpminsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMINSD instruction. This sequence compares each signed 32 bit value
//   of the source with the corresponding signed 32 bit value in the destination
//   and if the source value is less, it is copied to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMINSD,     // if [RBX][31:0] < XMM1[31:0] then
//                                 [RBX][31:0] -> XMM1[31:0]
//                             // if [RBX][63:32] < XMM1[63:32] then
//                                 [RBX][63:32] -> XMM1[63:32]
//                             // if [RBX][95:64] < XMM1[95:64] then
//                                 [RBX][95:64] -> XMM1[95:64]
//                             // if [RBX][127:96] < XMM1[127:96] then
//                                 [RBX][127:96] -> XMM1[127:96]
//
//  XMM2  XMM1  PMINSD,        // if XMM2[31:0] < XMM1[31:0] then
//                                 XMM2[31:0] -> XMM1[31:0]
//                             // if XMM2[63:32] < XMM1[63:32] then
//                                 XMM2[63:32] -> XMM1[63:32]
//                             // if XMM2[95:64] < XMM1[95:64] then
//                                 XMM2[95:64] -> XMM1[95:64]
//                             // if XMM2[127:96] < XMM1[127:96] then
//                                 XMM2[127:96] -> XMM1[127:96]
//
//  XMM2 <-  XMM1  PMINSD,  // if XMM1[31:0] < XMM2[31:0] then
//                                 XMM1[31:0] -> XMM2[31:0]
//                             // if XMM1[63:32] < XMM2[63:32] then
//                                 XMM1[63:32] -> XMM2[63:32]
//                             // if XMM1[95:64] < XMM2[95:64] then
//                                 XMM1[95:64] -> XMM2[95:64]
//                             // if XMM1[127:96] < XMM2[127:96] then
//                                 XMM1[127:96] -> XMM2[127:96]
//
//  XMM1  XMM8  PMINSD,        // if XMM1[31:0] < XMM8[31:0] then
//                                 XMM1[31:0] -> XMM8[31:0]
//                             // if XMM1[63:32] < XMM8[63:32] then
//                                 XMM1[63:32] -> XMM8[63:32]
//                             // if XMM1[95:64] < XMM8[95:64] then
//                                 XMM1[95:64] -> XMM8[95:64]
//                             // if XMM1[127:96] < XMM8[127:96] then
//                                 XMM1[127:96] -> XMM8[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpminswcomma ( PMINSW, )
//
// C prototype:
//  void dg_forthpminswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMINSW instruction. This sequence compares each signed 16 bit integer
//   of the source with the corresponding signed 16 bit integer in the
//   destination and if the source value is less, it is copied to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMINSW,     // if [RBX][15:0] < XMM1[15:0] then
//                                 [RBX][15:0] -> XMM1[15:0]
//                             // if [RBX][31:16] < XMM1[31:16] then
//                                 [RBX][31:16] -> XMM1[31:16]
//                             // if [RBX][47:32] < XMM1[47:32] then
//                                 [RBX][47:32] -> XMM1[47:32]
//                             // if [RBX][63:48] < XMM1[63:48] then
//                                 [RBX][63:48] -> XMM1[63:48]
//                             // if [RBX][79:64] < XMM1[79:64] then
//                                 [RBX][79:64] -> XMM1[79:64]
//                             // if [RBX][95:80] < XMM1[95:80] then
//                                 [RBX][95:80] -> XMM1[95:80]
//                             // if [RBX][111:96] < XMM1[111:96] then
//                                 [RBX][111:96] -> XMM1[111:96]
//                             // if [RBX][127:112] < XMM1[127:112] then
//                                 [RBX][127:112] -> XMM1[127:112]
//
//  RBX [R]  ST1  PMINSW,      // if [RBX][15:0] < ST1[15:0] then
//                                 [RBX][15:0] -> ST1[15:0]
//                             // if [RBX][31:16] < ST1[31:16] then
//                                 [RBX][31:16] -> ST1[31:16]
//                             // if [RBX][47:32] < ST1[47:32] then
//                                 [RBX][47:32] -> ST1[47:32]
//                             // if [RBX][63:48] < ST1[63:48] then
//                                 [RBX][63:48] -> ST1[63:48]
//
//  XMM2  XMM1  PMINSW,        // if XMM2[15:0] < XMM1[15:0] then
//                                 XMM2[15:0] -> XMM1[15:0]
//                             // if XMM2[31:16] < XMM1[31:16] then
//                                 XMM2[31:16] -> XMM1[31:16]
//                             // if XMM2[47:32] < XMM1[47:32] then
//                                 XMM2[47:32] -> XMM1[47:32]
//                             // if XMM2[63:48] < XMM1[63:48] then
//                                 XMM2[63:48] -> XMM1[63:48]
//                             // if XMM2[79:64] < XMM1[79:64] then
//                                 XMM2[79:64] -> XMM1[79:64]
//                             // if XMM2[95:80] < XMM1[95:80] then
//                                 XMM2[95:80] -> XMM1[95:80]
//                             // if XMM2[111:96] < XMM1[111:96] then
//                                 XMM2[111:96] -> XMM1[111:96]
//                             // if XMM2[127:112] < XMM1[127:112] then
//                                 XMM2[127:112] -> XMM1[127:112]
//
//  ST2  ST1  PMINSW,          // if ST2[15:0] < ST1[15:0] then
//                                 ST2[15:0] -> ST1[15:0]
//                             // if ST2[31:16] < ST1[31:16] then
//                                 ST2[31:16] -> ST1[31:16]
//                             // if ST2[47:32] < ST1[47:32] then
//                                 ST2[47:32] -> ST1[47:32]
//                             // if ST2[63:48] < ST1[63:48] then
//                                 ST2[63:48] -> ST1[63:48]
//
//  XMM2 <-  XMM1  PMINSW,  // if XMM1[15:0] < XMM2[15:0] then
//                                 XMM1[15:0] -> XMM2[15:0]
//                             // if XMM1[31:16] < XMM2[31:16] then
//                                 XMM1[31:16] -> XMM2[31:16]
//                             // if XMM1[47:32] < XMM2[47:32] then
//                                 XMM1[47:32] -> XMM2[47:32]
//                             // if XMM1[63:48] < XMM2[63:48] then
//                                 XMM1[63:48] -> XMM2[63:48]
//                             // if XMM1[79:64] < XMM2[79:64] then
//                                 XMM1[79:64] -> XMM2[79:64]
//                             // if XMM1[95:80] < XMM2[95:80] then
//                                 XMM1[95:80] -> XMM2[95:80]
//                             // if XMM1[111:96] < XMM2[111:96] then
//                                 XMM1[111:96] -> XMM2[111:96]
//                             // if XMM1[127:112] < XMM2[127:112] then
//                                 XMM1[127:112] -> XMM2[127:112]
//
//  XMM1  XMM8  PMINSW,        // if XMM1[15:0] < XMM8[15:0] then
//                                 XMM1[15:0] -> XMM8[15:0]
//                             // if XMM1[31:16] < XMM8[31:16] then
//                                 XMM1[31:16] -> XMM8[31:16]
//                             // if XMM1[47:32] < XMM8[47:32] then
//                                 XMM1[47:32] -> XMM8[47:32]
//                             // if XMM1[63:48] < XMM8[63:48] then
//                                 XMM1[63:48] -> XMM8[63:48]
//                             // if XMM1[79:64] < XMM8[79:64] then
//                                 XMM1[79:64] -> XMM8[79:64]
//                             // if XMM1[95:80] < XMM8[95:80] then
//                                 XMM1[95:80] -> XMM8[95:80]
//                             // if XMM1[111:96] < XMM8[111:96] then
//                                 XMM1[111:96] -> XMM8[111:96]
//                             // if XMM1[127:112] < XMM8[127:112] then
//                                 XMM1[127:112] -> XMM8[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////
 //
 // dg_forthpminubcomma ( PMINUB, )
 //
 // C prototype:
 //  void dg_forthpminubcomma (Bufferhandle* pBHarrayhead)
 //
 // Inputs:
 //  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
 //                                         which is used as the bufferhandle for
 //                                         the array where the other bufferhandles
 //                                         are stored.
 //
 // Stack action shorthand:
 //
 //  ( targetxparameterlist targetyparameterlist -- )
 //
 // Data stack in:
 //
 //  targetxparameterlist
 //  targetyparameterlist
 //
 //
 //  The parameter list for these targets can contain these addressing mode
 //   specifiers:
 //
 //   floatingpointregister
 //   targetxmmregister
 //   baseregister [R]
 //   baseregister displacement [R+N]
 //   absoluteaddress [N]
 //   baseregister scale indexregister displacement [R+S*R+N]
 //
 //  In 64 bit addressing mode, the parameter list for a target can also
 //   contain this specifier:
 //
 //   currentcompilebufferoffset [O]
 //
 //  If you want more control over how the instruction is encoded,
 //   you can use these addressing mode specifiers instead:
 //
 //   floatingpointregister FPSR
 //   targetxmmregister XMMR
 //   baseregister displacement minimumdisplacementsize [MOD]
 //   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
 //
 //  If you want to set the direction, you can use these:
 //   ->
 //   <-
 //
 //  Description of target parameters:
 //
 //   floatingpointregister        one of:
 //                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
 //   targetxmmregister            in both 32 and 64 bit address mode, one of:
 //                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
 //                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
 //   baseregister                 one of:
 //                                 NOREG or
 //                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
 //                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
 //                                  R8 R9 R10 R11 R12 R13 R14 R15
 //   displacement                 signed 32 bit value (even in 64BIT mode)
 //   absoluteaddress              signed 32 bit value (even in 64BIT mode)
 //   currentcompilebufferoffset   0 based offset in bytes from start of current
 //                                 compile buffer
 //   scale                        index register is multiplied by the scale,
 //                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
 //   indexregister                one of:
 //                                 NOREG or
 //                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
 //                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
 //                                  R8 R9 R10 R11 R12 R13 R14 R15
 //   [R]                          specifies a memory target at the address in
 //                                 the baseregister
 //   [R+N]                        specifies a memory target at the address at
 //                                 the value in the base register plus the
 //                                 signed displacement
 //   [N]                          specifies a memory target at an
 //                                 absoluteaddress which can't be larger than
 //                                 a signed 32 bit integer. This makes [N] not
 //                                 very useful in 64 bit mode
 //   [O]                          specifies a memory target at an offset in the
 //                                 current compile buffer.
 //   [R+S*R+N]                    specifies a memory target at the address at
 //                                 baseregister + (scale*indexregister) +
 //                                 displacement (displacement is signed)
 //
 //   ->                        sets the direction to forward.
 //                                 This is the default value.
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //   <-                        sets the direction to reverse.
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //
 //   FPSR                         specifies a floating point stack register
 //                                 FPSR is optional.
 //   XMMR                         specifies an xmmr register target.
 //                                 XMMR is optional.
 //   [MOD]                        specifies a memory target at the address of
 //                                 baseregister plus displacement using modr/m
 //                                 encoding. The encoding may be promoted to
 //                                 sib if modr/m does not support it.
 //   [SIB]                        specifies a memory target at the address at
 //                                 baseregister + (scale*indexregister) +
 //                                 displacement (displacement is signed)
 //                                 sib encoding will be used.
 //
 // Data stack out:
 //  none
 //
 // Execute state action:
 //  Pulls two targets from the data stack and compiles the opcode sequence for
 //   an x86 PMINUB instruction. This sequence compares each unsigned byte of the
 //   source with the corresponding unsigned byte in the destination and if the
 //   source value is less, it is copied to the destination.
 //
 // Compile state action:
 //  Compiles a call to a subroutine that does the execute state action.
 //
 // Examples:
 //  RBX [R]  XMM1  PMINUB,     // if [RBX][7:0] < XMM1[7:0] then
 //                                 [RBX][7:0] -> XMM1[7:0]
 //                             // if [RBX][15:8] < XMM1[15:8] then
 //                                 [RBX][15:8] -> XMM1[15:8]
 //                             // if [RBX][23:16] < XMM1[23:16] then
 //                                 [RBX][23:16] -> XMM1[23:16]
 //                             // if [RBX][31:24] < XMM1[31:24] then
 //                                 [RBX][31:24] -> XMM1[31:24]
 //                             // if [RBX][39:32] < XMM1[39:32] then
 //                                 [RBX][39:32] -> XMM1[39:32]
 //                             // if [RBX][47:40] < XMM1[47:40] then
 //                                 [RBX][47:40] -> XMM1[47:40]
 //                             // if [RBX][55:48] < XMM1[55:48] then
 //                                 [RBX][55:48] -> XMM1[55:48]
 //                             // if [RBX][63:56] < XMM1[63:56] then
 //                                 [RBX][63:56] -> XMM1[63:56]
 //                             // if [RBX][71:64] < XMM1[71:64] then
 //                                 [RBX][71:64] -> XMM1[71:64]
 //                             // if [RBX][79:72] < XMM1[79:72] then
 //                                 [RBX][79:72] -> XMM1[79:72]
 //                             // if [RBX][87:80] < XMM1[87:80] then
 //                                 [RBX][87:80] -> XMM1[87:80]
 //                             // if [RBX][95:88] < XMM1[95:88] then
 //                                 [RBX][95:88] -> XMM1[95:88]
 //                             // if [RBX][103:96] < XMM1[103:96] then
 //                                 [RBX][103:96] -> XMM1[103:96]
 //                             // if [RBX][111:104] < XMM1[111:104] then
 //                                 [RBX][111:104] -> XMM1[111:104]
 //                             // if [RBX][119:112] < XMM1[119:112] then
 //                                 [RBX][119:112] -> XMM1[119:112]
 //                             // if [RBX][127:120] < XMM1[127:120] then
 //                                 [RBX][127:120] -> XMM1[127:120]
 //
 //  RBX [R]  ST1  PMINUB,      // if [RBX][7:0] < ST1[7:0] then
 //                                 [RBX][7:0] -> ST1[7:0]
 //                             // if [RBX][15:8] < ST1[15:8] then
 //                                 [RBX][15:8] -> ST1[15:8]
 //                             // if [RBX][23:16] < ST1[23:16] then
 //                                 [RBX][23:16] -> ST1[23:16]
 //                             // if [RBX][31:24] < ST1[31:24] then
 //                                 [RBX][31:24] -> ST1[31:24]
 //                             // if [RBX][39:32] < ST1[39:32] then
 //                                 [RBX][39:32] -> ST1[39:32]
 //                             // if [RBX][47:40] < ST1[47:40] then
 //                                 [RBX][47:40] -> ST1[47:40]
 //                             // if [RBX][55:48] < ST1[55:48] then
 //                                 [RBX][55:48] -> ST1[55:48]
 //                             // if [RBX][63:56] < ST1[63:56] then
 //                                 [RBX][63:56] -> ST1[63:56]
 //
 //  XMM2  XMM1  PMINUB,        // if XMM2[7:0] < XMM1[7:0] then
 //                                 XMM2[7:0] -> XMM1[7:0]
 //                             // if XMM2[15:8] < XMM1[15:8] then
 //                                 XMM2[15:8] -> XMM1[15:8]
 //                             // if XMM2[23:16] < XMM1[23:16] then
 //                                 XMM2[23:16] -> XMM1[23:16]
 //                             // if XMM2[31:24] < XMM1[31:24] then
 //                                 XMM2[31:24] -> XMM1[31:24]
 //                             // if XMM2[39:32] < XMM1[39:32] then
 //                                 XMM2[39:32] -> XMM1[39:32]
 //                             // if XMM2[47:40] < XMM1[47:40] then
 //                                 XMM2[47:40] -> XMM1[47:40]
 //                             // if XMM2[55:48] < XMM1[55:48] then
 //                                 XMM2[55:48] -> XMM1[55:48]
 //                             // if XMM2[63:56] < XMM1[63:56] then
 //                                 XMM2[63:56] -> XMM1[63:56]
 //                             // if XMM2[71:64] < XMM1[71:64] then
 //                                 XMM2[71:64] -> XMM1[71:64]
 //                             // if XMM2[79:72] < XMM1[79:72] then
 //                                 XMM2[79:72] -> XMM1[79:72]
 //                             // if XMM2[87:80] < XMM1[87:80] then
 //                                 XMM2[87:80] -> XMM1[87:80]
 //                             // if XMM2[95:88] < XMM1[95:88] then
 //                                 XMM2[95:88] -> XMM1[95:88]
 //                             // if XMM2[103:96] < XMM1[103:96] then
 //                                 XMM2[103:96] -> XMM1[103:96]
 //                             // if XMM2[111:104] < XMM1[111:104] then
 //                                 XMM2[111:104] -> XMM1[111:104]
 //                             // if XMM2[119:112] < XMM1[119:112] then
 //                                 XMM2[119:112] -> XMM1[119:112]
 //                             // if XMM2[127:120] < XMM1[127:120] then
 //                                 XMM2[127:120] -> XMM1[127:120]
 //
 //  ST2  ST1  PMINUB,          // if ST2[7:0] < ST1[7:0] then
 //                                 ST2[7:0] -> ST1[7:0]
 //                             // if ST2[15:8] < ST1[15:8] then
 //                                 ST2[15:8] -> ST1[15:8]
 //                             // if ST2[23:16] < ST1[23:16] then
 //                                 ST2[23:16] -> ST1[23:16]
 //                             // if ST2[31:24] < ST1[31:24] then
 //                                 ST2[31:24] -> ST1[31:24]
 //                             // if ST2[39:32] < ST1[39:32] then
 //                                 ST2[39:32] -> ST1[39:32]
 //                             // if ST2[47:40] < ST1[47:40] then
 //                                 ST2[47:40] -> ST1[47:40]
 //                             // if ST2[55:48] < ST1[55:48] then
 //                                 ST2[55:48] -> ST1[55:48]
 //                             // if ST2[63:56] < ST1[63:56] then
 //                                 ST2[63:56] -> ST1[63:56]
 //
 //  XMM2 <-  XMM1  PMINUB,  // if XMM1[7:0] < XMM2[7:0] then
 //                                 XMM1[7:0] -> XMM2[7:0]
 //                             // if XMM1[15:8] < XMM2[15:8] then
 //                                 XMM1[15:8] -> XMM2[15:8]
 //                             // if XMM1[23:16] < XMM2[23:16] then
 //                                 XMM1[23:16] -> XMM2[23:16]
 //                             // if XMM1[31:24] < XMM2[31:24] then
 //                                 XMM1[31:24] -> XMM2[31:24]
 //                             // if XMM1[39:32] < XMM2[39:32] then
 //                                 XMM1[39:32] -> XMM2[39:32]
 //                             // if XMM1[47:40] < XMM2[47:40] then
 //                                 XMM1[47:40] -> XMM2[47:40]
 //                             // if XMM1[55:48] < XMM2[55:48] then
 //                                 XMM1[55:48] -> XMM2[55:48]
 //                             // if XMM1[63:56] < XMM2[63:56] then
 //                                 XMM1[63:56] -> XMM2[63:56]
 //                             // if XMM1[71:64] < XMM2[71:64] then
 //                                 XMM1[71:64] -> XMM2[71:64]
 //                             // if XMM1[79:72] < XMM2[79:72] then
 //                                 XMM1[79:72] -> XMM2[79:72]
 //                             // if XMM1[87:80] < XMM2[87:80] then
 //                                 XMM1[87:80] -> XMM2[87:80]
 //                             // if XMM1[95:88] < XMM2[95:88] then
 //                                 XMM1[95:88] -> XMM2[95:88]
 //                             // if XMM1[103:96] < XMM2[103:96] then
 //                                 XMM1[103:96] -> XMM2[103:96]
 //                             // if XMM1[111:104] < XMM2[111:104] then
 //                                 XMM1[111:104] -> XMM2[111:104]
 //                             // if XMM1[119:112] < XMM2[119:112] then
 //                                 XMM1[119:112] -> XMM2[119:112]
 //                             // if XMM1[127:120] < XMM2[127:120] then
 //                                 XMM1[127:120] -> XMM2[127:120]
 //
 //  XMM1  XMM8  PMINUB,        // if XMM1[7:0] < XMM8[7:0] then
 //                                 XMM1[7:0] -> XMM8[7:0]
 //                             // if XMM1[15:8] < XMM8[15:8] then
 //                                 XMM1[15:8] -> XMM8[15:8]
 //                             // if XMM1[23:16] < XMM8[23:16] then
 //                                 XMM1[23:16] -> XMM8[23:16]
 //                             // if XMM1[31:24] < XMM8[31:24] then
 //                                 XMM1[31:24] -> XMM8[31:24]
 //                             // if XMM1[39:32] < XMM8[39:32] then
 //                                 XMM1[39:32] -> XMM8[39:32]
 //                             // if XMM1[47:40] < XMM8[47:40] then
 //                                 XMM1[47:40] -> XMM8[47:40]
 //                             // if XMM1[55:48] < XMM8[55:48] then
 //                                 XMM1[55:48] -> XMM8[55:48]
 //                             // if XMM1[63:56] < XMM8[63:56] then
 //                                 XMM1[63:56] -> XMM8[63:56]
 //                             // if XMM1[71:64] < XMM8[71:64] then
 //                                 XMM1[71:64] -> XMM8[71:64]
 //                             // if XMM1[79:72] < XMM8[79:72] then
 //                                 XMM1[79:72] -> XMM8[79:72]
 //                             // if XMM1[87:80] < XMM8[87:80] then
 //                                 XMM1[87:80] -> XMM8[87:80]
 //                             // if XMM1[95:88] < XMM8[95:88] then
 //                                 XMM1[95:88] -> XMM8[95:88]
 //                             // if XMM1[103:96] < XMM8[103:96] then
 //                                 XMM1[103:96] -> XMM8[103:96]
 //                             // if XMM1[111:104] < XMM8[111:104] then
 //                                 XMM1[111:104] -> XMM8[111:104]
 //                             // if XMM1[119:112] < XMM8[119:112] then
 //                                 XMM1[119:112] -> XMM8[119:112]
 //                             // if XMM1[127:120] < XMM8[127:120] then
 //                                 XMM1[127:120] -> XMM8[127:120]
 //
 // Note:
 //  Only 1 target can be a memory target. The destination target must be an xmm
 //   register or floating point register. If the source is not memory, then it
 //   must be the same type of register as the destination.
 //
 // Failure cases:
 //  I didn't check the failure cases thoroughly, soo...
 //   you may see some strange things if you are not careful.
 //
 ///////////////////////////////////////////////////////////////////////////////////
 
 ///////////////////////////////////////////////////////////////////////////////////
 //
 // dg_forthpminudcomma ( PMINUD, )
 //
 // C prototype:
 //  void dg_forthpminudcomma (Bufferhandle* pBHarrayhead)
 //
 // Inputs:
 //  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
 //                                         which is used as the bufferhandle for
 //                                         the array where the other bufferhandles
 //                                         are stored.
 //
 // Stack action shorthand:
 //
 //  ( targetxparameterlist targetyparameterlist -- )
 //
 // Data stack in:
 //
 //  targetxparameterlist
 //  targetyparameterlist
 //
 //
 //  The parameter list for these targets can contain these addressing mode
 //   specifiers:
 //
 //   targetxmmregister
 //   baseregister [R]
 //   baseregister displacement [R+N]
 //   absoluteaddress [N]
 //   baseregister scale indexregister displacement [R+S*R+N]
 //
 //  In 64 bit addressing mode, the parameter list for a target can also
 //   contain this specifier:
 //
 //   currentcompilebufferoffset [O]
 //
 //  If you want more control over how the instruction is encoded,
 //   you can use these addressing mode specifiers instead:
 //
 //   targetxmmregister XMMR
 //   baseregister displacement minimumdisplacementsize [MOD]
 //   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
 //
 //  If you want to set the direction, you can use these:
 //   ->
 //   <-
 //
 //  Description of target parameters:
 //
 //   targetxmmregister            in both 32 and 64 bit address mode, one of:
 //                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
 //                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
 //   baseregister                 one of:
 //                                 NOREG or
 //                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
 //                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
 //                                  R8 R9 R10 R11 R12 R13 R14 R15
 //   displacement                 signed 32 bit value (even in 64BIT mode)
 //   absoluteaddress              signed 32 bit value (even in 64BIT mode)
 //   currentcompilebufferoffset   0 based offset in bytes from start of current
 //                                 compile buffer
 //   scale                        index register is multiplied by the scale,
 //                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
 //   indexregister                one of:
 //                                 NOREG or
 //                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
 //                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
 //                                  R8 R9 R10 R11 R12 R13 R14 R15
 //   [R]                          specifies a memory target at the address in
 //                                 the baseregister
 //   [R+N]                        specifies a memory target at the address at
 //                                 the value in the base register plus the
 //                                 signed displacement
 //   [N]                          specifies a memory target at an
 //                                 absoluteaddress which can't be larger than
 //                                 a signed 32 bit integer. This makes [N] not
 //                                 very useful in 64 bit mode
 //   [O]                          specifies a memory target at an offset in the
 //                                 current compile buffer.
 //   [R+S*R+N]                    specifies a memory target at the address at
 //                                 baseregister + (scale*indexregister) +
 //                                 displacement (displacement is signed)
 //
 //   ->                        sets the direction to forward.
 //                                 This is the default value.
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //   <-                        sets the direction to reverse.
 //                                 This is pushed after either addressing mode
 //                                 parameters and can not come in the middle
 //                                 of addressing mode parameters.
 //
 //   XMMR                         specifies an xmmr register target.
 //                                 XMMR is optional.
 //   [MOD]                        specifies a memory target at the address of
 //                                 baseregister plus displacement using modr/m
 //                                 encoding. The encoding may be promoted to
 //                                 sib if modr/m does not support it.
 //   [SIB]                        specifies a memory target at the address at
 //                                 baseregister + (scale*indexregister) +
 //                                 displacement (displacement is signed)
 //                                 sib encoding will be used.
 //
 // Data stack out:
 //  none
 //
 // Execute state action:
 //  Pulls two targets from the data stack and compiles the opcode sequence for
 //   an x86 PMINUD instruction. This sequence compares each unsigned 32 bit value
 //   of the source with each unsigned 32 bit value in the destination and if the
 //   source value is less, it is copied to the destination.
 //
 // Compile state action:
 //  Compiles a call to a subroutine that does the execute state action.
 //
 // Examples:
 //  RBX [R]  XMM1  PMINUD,     // if [RBX][31:0] < XMM1[31:0] then
 //                                 [RBX][31:0] -> XMM1[31:0]
 //                             // if [RBX][63:32] < XMM1[63:32] then
 //                                 [RBX][63:32] -> XMM1[63:32]
 //                             // if [RBX][95:64] < XMM1[95:64] then
 //                                 [RBX][95:64] -> XMM1[95:64]
 //                             // if [RBX][127:96] < XMM1[127:96] then
 //                                 [RBX][127:96] -> XMM1[127:96]
 //
 //  XMM2  XMM1  PMINUD,        // if XMM2[31:0] < XMM1[31:0] then
 //                                 XMM2[31:0] -> XMM1[31:0]
 //                             // if XMM2[63:32] < XMM1[63:32] then
 //                                 XMM2[63:32] -> XMM1[63:32]
 //                             // if XMM2[95:64] < XMM1[95:64] then
 //                                 XMM2[95:64] -> XMM1[95:64]
 //                             // if XMM2[127:96] < XMM1[127:96] then
 //                                 XMM2[127:96] -> XMM1[127:96]
 //
 //  XMM2 <-  XMM1  PMINUD,  // if XMM1[31:0] < XMM2[31:0] then
 //                                 XMM1[31:0] -> XMM2[31:0]
 //                             // if XMM1[63:32] < XMM2[63:32] then
 //                                 XMM1[63:32] -> XMM2[63:32]
 //                             // if XMM1[95:64] < XMM2[95:64] then
 //                                 XMM1[95:64] -> XMM2[95:64]
 //                             // if XMM1[127:96] < XMM2[127:96] then
 //                                 XMM1[127:96] -> XMM2[127:96]
 //
 //  XMM1  XMM8  PMINUD,        // if XMM1[31:0] < XMM8[31:0] then
 //                                 XMM1[31:0] -> XMM8[31:0]
 //                             // if XMM1[63:32] < XMM8[63:32] then
 //                                 XMM1[63:32] -> XMM8[63:32]
 //                             // if XMM1[95:64] < XMM8[95:64] then
 //                                 XMM1[95:64] -> XMM8[95:64]
 //                             // if XMM1[127:96] < XMM8[127:96] then
 //                                 XMM1[127:96] -> XMM8[127:96]
 //
 // Note:
 //  Only 1 target can be a memory target. The destination target must be an xmm
 //   register.
 //
 // Failure cases:
 //  I didn't check the failure cases thoroughly, soo...
 //   you may see some strange things if you are not careful.
 //
 ///////////////////////////////////////////////////////////////////////////////////
 
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpminuwcomma ( PMINUW, )
//
// C prototype:
//  void dg_forthpminuwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMINUW instruction. This sequence compares each unsigned 16 bit integer
//   of the source with each unsigned 16 bit integer in the destination and if the
//   source value is less, it is copied to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMINUW,     // if [RBX][15:0] < XMM1[15:0] then
//                                 [RBX][15:0] -> XMM1[15:0]
//                             // if [RBX][31:16] < XMM1[31:16] then
//                                 [RBX][31:16] -> XMM1[31:16]
//                             // if [RBX][47:32] < XMM1[47:32] then
//                                 [RBX][47:32] -> XMM1[47:32]
//                             // if [RBX][63:48] < XMM1[63:48] then
//                                 [RBX][63:48] -> XMM1[63:48]
//                             // if [RBX][79:64] < XMM1[79:64] then
//                                 [RBX][79:64] -> XMM1[79:64]
//                             // if [RBX][95:80] < XMM1[95:80] then
//                                 [RBX][95:80] -> XMM1[95:80]
//                             // if [RBX][111:96] < XMM1[111:96] then
//                                 [RBX][111:96] -> XMM1[111:96]
//                             // if [RBX][127:112] < XMM1[127:112] then
//                                 [RBX][127:112] -> XMM1[127:112]
//
//  XMM2  XMM1  PMINUW,        // if XMM2[15:0] < XMM1[15:0] then
//                                 XMM2[15:0] -> XMM1[15:0]
//                             // if XMM2[31:16] < XMM1[31:16] then
//                                 XMM2[31:16] -> XMM1[31:16]
//                             // if XMM2[47:32] < XMM1[47:32] then
//                                 XMM2[47:32] -> XMM1[47:32]
//                             // if XMM2[63:48] < XMM1[63:48] then
//                                 XMM2[63:48] -> XMM1[63:48]
//                             // if XMM2[79:64] < XMM1[79:64] then
//                                 XMM2[79:64] -> XMM1[79:64]
//                             // if XMM2[95:80] < XMM1[95:80] then
//                                 XMM2[95:80] -> XMM1[95:80]
//                             // if XMM2[111:96] < XMM1[111:96] then
//                                 XMM2[111:96] -> XMM1[111:96]
//                             // if XMM2[127:112] < XMM1[127:112] then
//                                 XMM2[127:112] -> XMM1[127:112]
//
//  XMM2 <-  XMM1  PMINUW,  // if XMM1[15:0] < XMM2[15:0] then
//                                 XMM1[15:0] -> XMM2[15:0]
//                             // if XMM1[31:16] < XMM2[31:16] then
//                                 XMM1[31:16] -> XMM2[31:16]
//                             // if XMM1[47:32] < XMM2[47:32] then
//                                 XMM1[47:32] -> XMM2[47:32]
//                             // if XMM1[63:48] < XMM2[63:48] then
//                                 XMM1[63:48] -> XMM2[63:48]
//                             // if XMM1[79:64] < XMM2[79:64] then
//                                 XMM1[79:64] -> XMM2[79:64]
//                             // if XMM1[95:80] < XMM2[95:80] then
//                                 XMM1[95:80] -> XMM2[95:80]
//                             // if XMM1[111:96] < XMM2[111:96] then
//                                 XMM1[111:96] -> XMM2[111:96]
//                             // if XMM1[127:112] < XMM2[127:112] then
//                                 XMM1[127:112] -> XMM2[127:112]
//
//  XMM1  XMM8  PMINUW,        // if XMM1[15:0] < XMM8[15:0] then
//                                 XMM1[15:0] -> XMM8[15:0]
//                             // if XMM1[31:16] < XMM8[31:16] then
//                                 XMM1[31:16] -> XMM8[31:16]
//                             // if XMM1[47:32] < XMM8[47:32] then
//                                 XMM1[47:32] -> XMM8[47:32]
//                             // if XMM1[63:48] < XMM8[63:48] then
//                                 XMM1[63:48] -> XMM8[63:48]
//                             // if XMM1[79:64] < XMM8[79:64] then
//                                 XMM1[79:64] -> XMM8[79:64]
//                             // if XMM1[95:80] < XMM8[95:80] then
//                                 XMM1[95:80] -> XMM8[95:80]
//                             // if XMM1[111:96] < XMM8[111:96] then
//                                 XMM1[111:96] -> XMM8[111:96]
//                             // if XMM1[127:112] < XMM8[127:112] then
//                                 XMM1[127:112] -> XMM8[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. If the source is not memory, then it must also be an xmm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpmovsxbwcomma ( PMOVSXBW, )
//
// C prototype:
//  void dg_forthpmovsxbwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMOVSXBW instruction. This sequence sign extends the eight signed
//   bytes in the lower 64 bits of the source to eight sixteen bit signed
//   integers and puts them into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMOVSXBW,     // [RBX][7:0] INT8->INT16 -> XMM1[15:0]
//                               // [RBX][15:8] INT8->INT16 -> XMM1[31:16]
//                               // [RBX][23:16] INT8->INT16 -> XMM1[47:32]
//                               // [RBX][31:24] INT8->INT16 -> XMM1[63:48]
//                               // [RBX][39:32] INT8->INT16 -> XMM1[79:64]
//                               // [RBX][47:40] INT8->INT16 -> XMM1[95:80]
//                               // [RBX][55:48] INT8->INT16 -> XMM1[111:96]
//                               // [RBX][63:56] INT8->INT16 -> XMM1[127:112]
//
//  XMM2  XMM1  PMOVSXBW,        // XMM2[7:0] INT8->INT16 -> XMM1[15:0]
//                               // XMM2[15:8] INT8->INT16 -> XMM1[31:16]
//                               // XMM2[23:16] INT8->INT16 -> XMM1[47:32]
//                               // XMM2[31:24] INT8->INT16 -> XMM1[63:48]
//                               // XMM2[39:32] INT8->INT16 -> XMM1[79:64]
//                               // XMM2[47:40] INT8->INT16 -> XMM1[95:80]
//                               // XMM2[55:48] INT8->INT16 -> XMM1[111:96]
//                               // XMM2[63:56] INT8->INT16 -> XMM1[127:112]
//
//  XMM2 <-  XMM1  PMOVSXBW,  // XMM1[7:0] INT8->INT16 -> XMM2[15:0]
//                               // XMM1[15:8] INT8->INT16 -> XMM2[31:16]
//                               // XMM1[23:16] INT8->INT16 -> XMM2[47:32]
//                               // XMM1[31:24] INT8->INT16 -> XMM2[63:48]
//                               // XMM1[39:32] INT8->INT16 -> XMM2[79:64]
//                               // XMM1[47:40] INT8->INT16 -> XMM2[95:80]
//                               // XMM1[55:48] INT8->INT16 -> XMM2[111:96]
//                               // XMM1[63:56] INT8->INT16 -> XMM2[127:112]
//
//  XMM1  XMM8  PMOVSXBW,        // XMM1[7:0] INT8->INT16 -> XMM8[15:0]
//                               // XMM1[15:8] INT8->INT16 -> XMM8[31:16]
//                               // XMM1[23:16] INT8->INT16 -> XMM8[47:32]
//                               // XMM1[31:24] INT8->INT16 -> XMM8[63:48]
//                               // XMM1[39:32] INT8->INT16 -> XMM8[79:64]
//                               // XMM1[47:40] INT8->INT16 -> XMM8[95:80]
//                               // XMM1[55:48] INT8->INT16 -> XMM8[111:96]
//                               // XMM1[63:56] INT8->INT16 -> XMM8[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. If the source is not memory, then it must also be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmovsxbwcomma ( VPMOVSXBW, )
//
// C prototype:
//  void dg_forthvpmovsxbwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VPMOVSXBW instruction. This sequence sign extends the 8 or 16
//   signed bytes in the lower half of the source to 8 or 16 signed 16 bit
//   integers and puts them into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  VPMOVSXBW,    // [RBX][7:0]   INT8->INT16 -> XMM1[15:0]
//                               // [RBX][15:8]  INT8->INT16 -> XMM1[31:16]
//                               // [RBX][23:16] INT8->INT16 -> XMM1[47:32]
//                               // [RBX][31:24] INT8->INT16 -> XMM1[63:48]
//                               // [RBX][39:32] INT8->INT16 -> XMM1[79:64]
//                               // [RBX][47:40] INT8->INT16 -> XMM1[95:80]
//                               // [RBX][55:48] INT8->INT16 -> XMM1[111:96]
//                               // [RBX][63:56] INT8->INT16 -> XMM1[127:112]
//
//  XMM2  XMM1  VPMOVSXBW,       // XMM2[7:0]   INT8->INT16 -> XMM1[15:0]
//                               // XMM2[15:8]  INT8->INT16 -> XMM1[31:16]
//                               // XMM2[23:16] INT8->INT16 -> XMM1[47:32]
//                               // XMM2[31:24] INT8->INT16 -> XMM1[63:48]
//                               // XMM2[39:32] INT8->INT16 -> XMM1[79:64]
//                               // XMM2[47:40] INT8->INT16 -> XMM1[95:80]
//                               // XMM2[55:48] INT8->INT16 -> XMM1[111:96]
//                               // XMM2[63:56] INT8->INT16 -> XMM1[127:112]
//
//  XMM2 <-  XMM1  VPMOVSXBW, // XMM1[7:0]   INT8->INT16 -> XMM2[15:0]
//                               // XMM1[15:8]  INT8->INT16 -> XMM2[31:16]
//                               // XMM1[23:16] INT8->INT16 -> XMM2[47:32]
//                               // XMM1[31:24] INT8->INT16 -> XMM2[63:48]
//                               // XMM1[39:32] INT8->INT16 -> XMM2[79:64]
//                               // XMM1[47:40] INT8->INT16 -> XMM2[95:80]
//                               // XMM1[55:48] INT8->INT16 -> XMM2[111:96]
//                               // XMM1[63:56] INT8->INT16 -> XMM2[127:112]
//
//  XMM1  XMM8  VPMOVSXBW,       // XMM1[7:0]   INT8->INT16 -> XMM8[15:0]
//                               // XMM1[15:8]  INT8->INT16 -> XMM8[31:16]
//                               // XMM1[23:16] INT8->INT16 -> XMM8[47:32]
//                               // XMM1[31:24] INT8->INT16 -> XMM8[63:48]
//                               // XMM1[39:32] INT8->INT16 -> XMM8[79:64]
//                               // XMM1[47:40] INT8->INT16 -> XMM8[95:80]
//                               // XMM1[55:48] INT8->INT16 -> XMM8[111:96]
//                               // XMM1[63:56] INT8->INT16 -> XMM8[127:112]
//
//  YMM1  YMM8  VPMOVSXBW,       // YMM1[7:0]     INT8->INT16 -> YMM8[15:0]
//                               // YMM1[15:8]    INT8->INT16 -> YMM8[31:16]
//                               // YMM1[23:16]   INT8->INT16 -> YMM8[47:32]
//                               // YMM1[31:24]   INT8->INT16 -> YMM8[63:48]
//                               // YMM1[39:32]   INT8->INT16 -> YMM8[79:64]
//                               // YMM1[47:40]   INT8->INT16 -> YMM8[95:80]
//                               // YMM1[55:48]   INT8->INT16 -> YMM8[111:96]
//                               // YMM1[63:56]   INT8->INT16 -> YMM8[127:112]
//                               // YMM1[71:64]   INT8->INT16 -> YMM8[143:128]
//                               // YMM1[79:72]   INT8->INT16 -> YMM8[159:144]
//                               // ...
//                               // YMM1[127:120] INT8->INT16 -> YMM8[255:240]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpmovsxbdcomma ( PMOVSXBD, )
//
// C prototype:
//  void dg_forthpmovsxbdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMOVSXBD instruction. This sequence sign extends the four signed
//   bytes in the lower 32 bits of the source to four 32 bit signed
//   integers and puts them into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMOVSXBD,     // [RBX][7:0] INT8->INT32 -> XMM1[31:0]
//                               // [RBX][15:8] INT8->INT32 -> XMM1[63:32]
//                               // [RBX][23:16] INT8->INT32 -> XMM1[95:64]
//                               // [RBX][31:24] INT8->INT32 -> XMM1[127:96]
//
//  XMM2  XMM1  PMOVSXBD,        // XMM2[7:0] INT8->INT32 -> XMM1[31:0]
//                               // XMM2[15:8] INT8->INT32 -> XMM1[63:32]
//                               // XMM2[23:16] INT8->INT32 -> XMM1[95:64]
//                               // XMM2[31:24] INT8->INT32 -> XMM1[127:96]
//
//  XMM2 <-  XMM1  PMOVSXBD,  // XMM1[7:0] INT8->INT32 -> XMM2[31:0]
//                               // XMM1[15:8] INT8->INT32 -> XMM2[63:32]
//                               // XMM1[23:16] INT8->INT32 -> XMM2[95:64]
//                               // XMM1[31:24] INT8->INT32 -> XMM2[127:96]
//
//  XMM1  XMM8  PMOVSXBD,        // XMM1[7:0] INT8->INT32 -> XMM8[31:0]
//                               // XMM1[15:8] INT8->INT32 -> XMM8[63:32]
//                               // XMM1[23:16] INT8->INT32 -> XMM8[95:64]
//                               // XMM1[31:24] INT8->INT32 -> XMM8[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. If the source is not memory, then it must also be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmovsxbdcomma ( VPMOVSXBD, )
//
// C prototype:
//  void dg_forthvpmovsxbdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VPMOVSXBD instruction. This sequence sign extends the 4 or 8 signed
//   bytes in the lower quarter of the source to 4 or 8 signed 32 bit
//   integers and puts them into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  VPMOVSXBD,    // [RBX][7:0] INT8->INT32 -> XMM1[31:0]
//                               // [RBX][15:8] INT8->INT32 -> XMM1[63:32]
//                               // [RBX][23:16] INT8->INT32 -> XMM1[95:64]
//                               // [RBX][31:24] INT8->INT32 -> XMM1[127:96]
//
//  XMM2  XMM1  VPMOVSXBD,       // XMM2[7:0] INT8->INT32 -> XMM1[31:0]
//                               // XMM2[15:8] INT8->INT32 -> XMM1[63:32]
//                               // XMM2[23:16] INT8->INT32 -> XMM1[95:64]
//                               // XMM2[31:24] INT8->INT32 -> XMM1[127:96]
//
//  XMM2 <-  XMM1  VPMOVSXBD, // XMM1[7:0] INT8->INT32 -> XMM2[31:0]
//                               // XMM1[15:8] INT8->INT32 -> XMM2[63:32]
//                               // XMM1[23:16] INT8->INT32 -> XMM2[95:64]
//                               // XMM1[31:24] INT8->INT32 -> XMM2[127:96]
//
//  XMM1  XMM8  VPMOVSXBD,       // XMM1[7:0] INT8->INT32 -> XMM8[31:0]
//                               // XMM1[15:8] INT8->INT32 -> XMM8[63:32]
//                               // XMM1[23:16] INT8->INT32 -> XMM8[95:64]
//                               // XMM1[31:24] INT8->INT32 -> XMM8[127:96]
//
//  YMM1  YMM8  VPMOVSXBD,       // YMM1[7:0] INT8->INT32 -> YMM8[31:0]
//                               // YMM1[15:8] INT8->INT32 -> YMM8[63:32]
//                               // YMM1[23:16] INT8->INT32 -> YMM8[95:64]
//                               // YMM1[31:24] INT8->INT32 -> YMM8[127:96]
//                               // ...
//                               // YMM1[63:56] INT8->INT32 -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpmovsxbqcomma ( PMOVSXBQ, )
//
// C prototype:
//  void dg_forthpmovsxbqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMOVSXBQ instruction. This sequence sign extends the two signed
//   bytes in the lower 16 bits of the source to two 64 bit signed
//   integers and puts them into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMOVSXBQ,     // [RBX][7:0] INT8->INT64 -> XMM1[63:0]
//                               // [RBX][15:8] INT8->INT64 -> XMM1[127:64]
//
//  XMM2  XMM1  PMOVSXBQ,        // XMM2[7:0] INT8->INT64 -> XMM1[63:0]
//                               // XMM2[15:8] INT8->INT64 -> XMM1[127:64]
//
//  XMM2 <-  XMM1  PMOVSXBQ,  // XMM1[7:0] INT8->INT64 -> XMM2[63:0]
//                               // XMM1[15:8] INT8->INT64 -> XMM2[127:64]
//
//  XMM1  XMM8  PMOVSXBQ,        // XMM1[7:0] INT8->INT64 -> XMM8[63:0]
//                               // XMM1[15:8] INT8->INT64 -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. If the source is not memory, then it must be the same type of
//   register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmovsxbqcomma ( VPMOVSXBQ, )
//
// C prototype:
//  void dg_forthvpmovsxbqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VPMOVSXBQ instruction. This sequence sign extends the 2 or 4 signed
//   bytes in the lower eighth of the source to 2 or 4 64 bit signed
//   integers and puts them into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  VPMOVSXBQ,    // [RBX][7:0]  INT8->INT64 -> XMM1[63:0]
//                               // [RBX][15:8] INT8->INT64 -> XMM1[127:64]
//
//  XMM2  XMM1  VPMOVSXBQ,       // XMM2[7:0]  INT8->INT64 -> XMM1[63:0]
//                               // XMM2[15:8] INT8->INT64 -> XMM1[127:64]
//
//  XMM2 <-  XMM1  VPMOVSXBQ, // XMM1[7:0]  INT8->INT64 -> XMM2[63:0]
//                               // XMM1[15:8] INT8->INT64 -> XMM2[127:64]
//
//  XMM1  XMM8  VPMOVSXBQ,       // XMM1[7:0]  INT8->INT64 -> XMM8[63:0]
//                               // XMM1[15:8] INT8->INT64 -> XMM8[127:64]
//
//  YMM1  YMM8  VPMOVSXBQ,       // YMM1[7:0]   INT8->INT64 -> YMM8[63:0]
//                               // YMM1[15:8]  INT8->INT64 -> YMM8[127:64]
//                               // YMM1[23:16] INT8->INT64 -> YMM8[191:128]
//                               // YMM1[31:24] INT8->INT64 -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpmovsxwdcomma ( PMOVSXWD, )
//
// C prototype:
//  void dg_forthpmovsxwdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMOVSXWD instruction. This sequence sign extends the four signed
//   16 bit integers in the lower 64 bits of the source to four 32 bit signed
//   integers and puts them into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMOVSXWD,     // [RBX][15:0] INT16->INT32 -> XMM1[31:0]
//                               // [RBX][31:16] INT16->INT32 -> XMM1[63:32]
//                               // [RBX][47:32] INT16->INT32 -> XMM1[95:64]
//                               // [RBX][63:48] INT16->INT32 -> XMM1[127:96]
//
//  XMM2  XMM1  PMOVSXWD,        // XMM2[15:0] INT16->INT32 -> XMM1[31:0]
//                               // XMM2[31:16] INT16->INT32 -> XMM1[63:32]
//                               // XMM2[47:32] INT16->INT32 -> XMM1[95:64]
//                               // XMM2[63:48] INT16->INT32 -> XMM1[127:96]
//
//  XMM2 <-  XMM1  PMOVSXWD,  // XMM1[15:0] INT16->INT32 -> XMM2[31:0]
//                               // XMM1[31:16] INT16->INT32 -> XMM2[63:32]
//                               // XMM1[47:32] INT16->INT32 -> XMM2[95:64]
//                               // XMM1[63:48] INT16->INT32 -> XMM2[127:96]
//
//  XMM1  XMM8  PMOVSXWD,        // XMM1[15:0] INT16->INT32 -> XMM8[31:0]
//                               // XMM1[31:16] INT16->INT32 -> XMM8[63:32]
//                               // XMM1[47:32] INT16->INT32 -> XMM8[95:64]
//                               // XMM1[63:48] INT16->INT32 -> XMM8[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. If the source is not memory, then it must be the same type of
//   register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmovsxwdcomma ( VPMOVSXWD, )
//
// C prototype:
//  void dg_forthvpmovsxwdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VPMOVSXWD instruction. This sequence sign extends the 4 or 8 signed
//   16 bit integers in the lower half of the source to 4 or 8 32 bit signed
//   integers and puts them into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  VPMOVSXWD,    // [RBX][15:0]  INT16->INT32 -> XMM1[31:0]
//                               // [RBX][31:16] INT16->INT32 -> XMM1[63:32]
//                               // [RBX][47:32] INT16->INT32 -> XMM1[95:64]
//                               // [RBX][63:48] INT16->INT32 -> XMM1[127:96]
//
//  XMM2  XMM1  VPMOVSXWD,       // XMM2[15:0]  INT16->INT32 -> XMM1[31:0]
//                               // XMM2[31:16] INT16->INT32 -> XMM1[63:32]
//                               // XMM2[47:32] INT16->INT32 -> XMM1[95:64]
//                               // XMM2[63:48] INT16->INT32 -> XMM1[127:96]
//
//  XMM2 <-  XMM1  VPMOVSXWD, // XMM1[15:0]  INT16->INT32 -> XMM2[31:0]
//                               // XMM1[31:16] INT16->INT32 -> XMM2[63:32]
//                               // XMM1[47:32] INT16->INT32 -> XMM2[95:64]
//                               // XMM1[63:48] INT16->INT32 -> XMM2[127:96]
//
//  XMM1  XMM8  VPMOVSXWD,       // XMM1[15:0]  INT16->INT32 -> XMM8[31:0]
//                               // XMM1[31:16] INT16->INT32 -> XMM8[63:32]
//                               // XMM1[47:32] INT16->INT32 -> XMM8[95:64]
//                               // XMM1[63:48] INT16->INT32 -> XMM8[127:96]
//
//  YMM1  YMM8  VPMOVSXWD,       // YMM1[15:0]    INT16->INT32 -> YMM8[31:0]
//                               // YMM1[31:16]   INT16->INT32 -> YMM8[63:32]
//                               // YMM1[47:32]   INT16->INT32 -> YMM8[95:64]
//                               // YMM1[63:48]   INT16->INT32 -> YMM8[127:96]
//                               // ...
//                               // YMM1[127:112] INT16->INT32 -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpmovsxwqcomma ( PMOVSXWQ, )
//
// C prototype:
//  void dg_forthpmovsxwqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMOVSXWQ instruction. This sequence sign extends the two signed
//   16 bit integers in the lower 32 bits of the source to two 64 bit signed
//   integers and puts them into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMOVSXWQ,     // [RBX][15:0] INT16->INT64 -> XMM1[63:0]
//                               // [RBX][31:16] INT16->INT64 -> XMM1[127:64]
//
//  XMM2  XMM1  PMOVSXWQ,        // XMM2[15:0] INT16->INT64 -> XMM1[63:0]
//                               // XMM2[31:16] INT16->INT64 -> XMM1[127:64]
//
//  XMM2 <-  XMM1  PMOVSXWQ,  // XMM1[15:0] INT16->INT64 -> XMM2[63:0]
//                               // XMM1[31:16] INT16->INT64 -> XMM2[127:64]
//
//  XMM1  XMM8  PMOVSXWQ,        // XMM1[15:0] INT16->INT64 -> XMM8[63:0]
//                               // XMM1[31:16] INT16->INT64 -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. If the source is not memory, then it must be the same type of
//   register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmovsxwqcomma ( VPMOVSXWQ, )
//
// C prototype:
//  void dg_forthvpmovsxwqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VPMOVSXWQ instruction. This sequence sign extends the 2 or 4 signed
//   16 bit integers in the lower quarter of the source to 2 or 4 64 bit signed
//   integers and puts them into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  VPMOVSXWQ,    // [RBX][15:0]  INT16->INT64 -> XMM1[63:0]
//                               // [RBX][31:16] INT16->INT64 -> XMM1[127:64]
//
//  XMM2  XMM1  VPMOVSXWQ,       // XMM2[15:0]  INT16->INT64 -> XMM1[63:0]
//                               // XMM2[31:16] INT16->INT64 -> XMM1[127:64]
//
//  XMM2 <-  XMM1  VPMOVSXWQ, // XMM1[15:0]  INT16->INT64 -> XMM2[63:0]
//                               // XMM1[31:16] INT16->INT64 -> XMM2[127:64]
//
//  XMM1  XMM8  VPMOVSXWQ,       // XMM1[15:0]  INT16->INT64 -> XMM8[63:0]
//                               // XMM1[31:16] INT16->INT64 -> XMM8[127:64]
//
//  YMM1  YMM8  VPMOVSXWQ,       // YMM1[15:0]  INT16->INT64 -> YMM8[63:0]
//                               // YMM1[31:16] INT16->INT64 -> YMM8[127:64]
//                               // YMM1[47:32] INT16->INT64 -> YMM8[191:128]
//                               // YMM1[63:48] INT16->INT64 -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpmovsxdqcomma ( PMOVSXDQ, )
//
// C prototype:
//  void dg_forthpmovsxdqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMOVSXDQ instruction. This sequence sign extends the two signed
//   32 bit integers in the lower 64 bits of the source to two 64 bit signed
//   integers and puts them into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMOVSXDQ,     // [RBX][31:0] INT32->INT64 -> XMM1[63:0]
//                               // [RBX][63:32] INT32->INT64 -> XMM1[127:64]
//
//  XMM2  XMM1  PMOVSXDQ,        // XMM2[31:0] INT32->INT64 -> XMM1[63:0]
//                               // XMM2[63:32] INT32->INT64 -> XMM1[127:64]
//
//  XMM2 <-  XMM1  PMOVSXDQ,  // XMM1[31:0] INT32->INT64 -> XMM2[63:0]
//                               // XMM1[63:32] INT32->INT64 -> XMM2[127:64]
//
//  XMM1  XMM8  PMOVSXDQ,        // XMM1[31:0] INT32->INT64 -> XMM8[63:0]
//                               // XMM1[63:32] INT32->INT64 -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. If the source is not memory, then it must be the same type of
//   register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmovsxdqcomma ( VPMOVSXDQ, )
//
// C prototype:
//  void dg_forthvpmovsxdqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VPMOVSXDQ instruction. This sequence sign extends the 2 or 4 signed
//   32 bit integers in the lower half of the source to 2 or 4 64 bit signed
//   integers and puts them into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  VPMOVSXDQ,    // [RBX][31:0]  INT32->INT64 -> XMM1[63:0]
//                               // [RBX][63:32] INT32->INT64 -> XMM1[127:64]
//
//  XMM2  XMM1  VPMOVSXDQ,       // XMM2[31:0]   INT32->INT64 -> XMM1[63:0]
//                               // XMM2[63:32]  INT32->INT64 -> XMM1[127:64]
//
//  XMM2 <-  XMM1  VPMOVSXDQ, // XMM1[31:0]   INT32->INT64 -> XMM2[63:0]
//                               // XMM1[63:32]  INT32->INT64 -> XMM2[127:64]
//
//  XMM1  XMM8  VPMOVSXDQ,       // XMM1[31:0]   INT32->INT64 -> XMM8[63:0]
//                               // XMM1[63:32]  INT32->INT64 -> XMM8[127:64]
//
//  YMM1  YMM8  VPMOVSXDQ,       // YMM1[31:0]   INT32->INT64 -> YMM8[63:0]
//                               // YMM1[63:32]  INT32->INT64 -> YMM8[127:64]
//                               // YMM1[95:64]  INT32->INT64 -> YMM8[191:128]
//                               // YMM1[127:96] INT32->INT64 -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpmovzxbwcomma ( PMOVZXBW, )
//
// C prototype:
//  void dg_forthpmovzxbwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMOVZXBW instruction. This sequence zero extends the eight unsigned
//   bytes in the lower 64 bits of the source to eight sixteen bit unsigned
//   integers and puts them into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMOVZXBW,     // [RBX][7:0] UINT8->UINT16 -> XMM1[15:0]
//                               // [RBX][15:8] UINT8->UINT16 -> XMM1[31:16]
//                               // [RBX][23:16] UINT8->UINT16 -> XMM1[47:32]
//                               // [RBX][31:24] UINT8->UINT16 -> XMM1[63:48]
//                               // [RBX][39:32] UINT8->UINT16 -> XMM1[79:64]
//                               // [RBX][47:40] UINT8->UINT16 -> XMM1[95:80]
//                               // [RBX][55:48] UINT8->UINT16 -> XMM1[111:96]
//                               // [RBX][63:56] UINT8->UINT16 -> XMM1[127:112]
//
//  XMM2  XMM1  PMOVZXBW,        // XMM2[7:0] UINT8->UINT16 -> XMM1[15:0]
//                               // XMM2[15:8] UINT8->UINT16 -> XMM1[31:16]
//                               // XMM2[23:16] UINT8->UINT16 -> XMM1[47:32]
//                               // XMM2[31:24] UINT8->UINT16 -> XMM1[63:48]
//                               // XMM2[39:32] UINT8->UINT16 -> XMM1[79:64]
//                               // XMM2[47:40] UINT8->UINT16 -> XMM1[95:80]
//                               // XMM2[55:48] UINT8->UINT16 -> XMM1[111:96]
//                               // XMM2[63:56] UINT8->UINT16 -> XMM1[127:112]
//
//  XMM2 <-  XMM1  PMOVZXBW,  // XMM1[7:0] UINT8->UINT16 -> XMM2[15:0]
//                               // XMM1[15:8] UINT8->UINT16 -> XMM2[31:16]
//                               // XMM1[23:16] UINT8->UINT16 -> XMM2[47:32]
//                               // XMM1[31:24] UINT8->UINT16 -> XMM2[63:48]
//                               // XMM1[39:32] UINT8->UINT16 -> XMM2[79:64]
//                               // XMM1[47:40] UINT8->UINT16 -> XMM2[95:80]
//                               // XMM1[55:48] UINT8->UINT16 -> XMM2[111:96]
//                               // XMM1[63:56] UINT8->UINT16 -> XMM2[127:112]
//
//  XMM1  XMM8  PMOVZXBW,        // XMM1[7:0] UINT8->UINT16 -> XMM8[15:0]
//                               // XMM1[15:8] UINT8->UINT16 -> XMM8[31:16]
//                               // XMM1[23:16] UINT8->UINT16 -> XMM8[47:32]
//                               // XMM1[31:24] UINT8->UINT16 -> XMM8[63:48]
//                               // XMM1[39:32] UINT8->UINT16 -> XMM8[79:64]
//                               // XMM1[47:40] UINT8->UINT16 -> XMM8[95:80]
//                               // XMM1[55:48] UINT8->UINT16 -> XMM8[111:96]
//                               // XMM1[63:56] UINT8->UINT16 -> XMM8[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. If the source is not memory, then it must be the same type of
//   register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmovzxbwcomma ( VPMOVZXBW, )
//
// C prototype:
//  void dg_forthvpmovzxbwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VPMOVZXBW instruction. This sequence zero extends the 8 or 16
//   unsigned bytes in lower half of the source to 8 or 16 16 bit unsigned 
//   integers and puts them into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  VPMOVZXBW,    // [RBX][7:0]   UINT8->UINT16 -> XMM1[15:0]
//                               // [RBX][15:8]  UINT8->UINT16 -> XMM1[31:16]
//                               // [RBX][23:16] UINT8->UINT16 -> XMM1[47:32]
//                               // [RBX][31:24] UINT8->UINT16 -> XMM1[63:48]
//                               // [RBX][39:32] UINT8->UINT16 -> XMM1[79:64]
//                               // [RBX][47:40] UINT8->UINT16 -> XMM1[95:80]
//                               // [RBX][55:48] UINT8->UINT16 -> XMM1[111:96]
//                               // [RBX][63:56] UINT8->UINT16 -> XMM1[127:112]
//
//  XMM2  XMM1  VPMOVZXBW,       // XMM2[7:0]   UINT8->UINT16 -> XMM1[15:0]
//                               // XMM2[15:8]  UINT8->UINT16 -> XMM1[31:16]
//                               // XMM2[23:16] UINT8->UINT16 -> XMM1[47:32]
//                               // XMM2[31:24] UINT8->UINT16 -> XMM1[63:48]
//                               // XMM2[39:32] UINT8->UINT16 -> XMM1[79:64]
//                               // XMM2[47:40] UINT8->UINT16 -> XMM1[95:80]
//                               // XMM2[55:48] UINT8->UINT16 -> XMM1[111:96]
//                               // XMM2[63:56] UINT8->UINT16 -> XMM1[127:112]
//
//  XMM2 <-  XMM1  VPMOVZXBW, // XMM1[7:0]   UINT8->UINT16 -> XMM2[15:0]
//                               // XMM1[15:8]  UINT8->UINT16 -> XMM2[31:16]
//                               // XMM1[23:16] UINT8->UINT16 -> XMM2[47:32]
//                               // XMM1[31:24] UINT8->UINT16 -> XMM2[63:48]
//                               // XMM1[39:32] UINT8->UINT16 -> XMM2[79:64]
//                               // XMM1[47:40] UINT8->UINT16 -> XMM2[95:80]
//                               // XMM1[55:48] UINT8->UINT16 -> XMM2[111:96]
//                               // XMM1[63:56] UINT8->UINT16 -> XMM2[127:112]
//
//  XMM1  XMM8  VPMOVZXBW,       // XMM1[7:0]   UINT8->UINT16 -> XMM8[15:0]
//                               // XMM1[15:8]  UINT8->UINT16 -> XMM8[31:16]
//                               // XMM1[23:16] UINT8->UINT16 -> XMM8[47:32]
//                               // XMM1[31:24] UINT8->UINT16 -> XMM8[63:48]
//                               // XMM1[39:32] UINT8->UINT16 -> XMM8[79:64]
//                               // XMM1[47:40] UINT8->UINT16 -> XMM8[95:80]
//                               // XMM1[55:48] UINT8->UINT16 -> XMM8[111:96]
//                               // XMM1[63:56] UINT8->UINT16 -> XMM8[127:112]
//
//  YMM1  YMM8  VPMOVZXBW,       // YMM1[7:0]     UINT8->UINT16 -> YMM8[15:0]
//                               // YMM1[15:8]    UINT8->UINT16 -> YMM8[31:16]
//                               // YMM1[23:16]   UINT8->UINT16 -> YMM8[47:32]
//                               // YMM1[31:24]   UINT8->UINT16 -> YMM8[63:48]
//                               // YMM1[39:32]   UINT8->UINT16 -> YMM8[79:64]
//                               // YMM1[47:40]   UINT8->UINT16 -> YMM8[95:80]
//                               // YMM1[55:48]   UINT8->UINT16 -> YMM8[111:96]
//                               // YMM1[63:56]   UINT8->UINT16 -> YMM8[127:112]
//                               // YMM1[71:64]   UINT8->UINT16 -> YMM8[143:128]
//                               // YMM1[79:72]   UINT8->UINT16 -> YMM8[159:144]
//                               // ...
//                               // YMM1[127:120] UINT8->UINT16 -> YMM8[255:240]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpmovzxbdcomma ( PMOVZXBD, )
//
// C prototype:
//  void dg_forthpmovzxbdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMOVZXBD instruction. This sequence zero extends the four unsigned
//   bytes in the lower 32 bits of the source to four 32 bit unsigned
//   integers and puts them into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMOVZXBD,     // [RBX][7:0] UINT8->UINT32 -> XMM1[31:0]
//                               // [RBX][15:8] UINT8->UINT32 -> XMM1[63:32]
//                               // [RBX][23:16] UINT8->UINT32 -> XMM1[95:64]
//                               // [RBX][31:24] UINT8->UINT32 -> XMM1[127:96]
//
//  XMM2  XMM1  PMOVZXBD,        // XMM2[7:0] UINT8->UINT32 -> XMM1[31:0]
//                               // XMM2[15:8] UINT8->UINT32 -> XMM1[63:32]
//                               // XMM2[23:16] UINT8->UINT32 -> XMM1[95:64]
//                               // XMM2[31:24] UINT8->UINT32 -> XMM1[127:96]
//
//  XMM2 <-  XMM1  PMOVZXBD,  // XMM1[7:0] UINT8->UINT32 -> XMM2[31:0]
//                               // XMM1[15:8] UINT8->UINT32 -> XMM2[63:32]
//                               // XMM1[23:16] UINT8->UINT32 -> XMM2[95:64]
//                               // XMM1[31:24] UINT8->UINT32 -> XMM2[127:96]
//
//  XMM1  XMM8  PMOVZXBD,        // XMM1[7:0] UINT8->UINT32 -> XMM8[31:0]
//                               // XMM1[15:8] UINT8->UINT32 -> XMM8[63:32]
//                               // XMM1[23:16] UINT8->UINT32 -> XMM8[95:64]
//                               // XMM1[31:24] UINT8->UINT32 -> XMM8[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmovzxbdcomma ( VPMOVZXBD, )
//
// C prototype:
//  void dg_forthvpmovzxbdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VPMOVZXBD instruction. This sequence zero extends the 4 or 8
//   unsigned bytes in the lower quarter of the source to 4 or 8 32 bit 
//   unsigned integers and puts them into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  VPMOVZXBD,    // [RBX][7:0] UINT8->UINT32 -> XMM1[31:0]
//                               // [RBX][15:8] UINT8->UINT32 -> XMM1[63:32]
//                               // [RBX][23:16] UINT8->UINT32 -> XMM1[95:64]
//                               // [RBX][31:24] UINT8->UINT32 -> XMM1[127:96]
//
//  XMM2  XMM1  VPMOVZXBD,       // XMM2[7:0] UINT8->UINT32 -> XMM1[31:0]
//                               // XMM2[15:8] UINT8->UINT32 -> XMM1[63:32]
//                               // XMM2[23:16] UINT8->UINT32 -> XMM1[95:64]
//                               // XMM2[31:24] UINT8->UINT32 -> XMM1[127:96]
//
//  XMM2 <-  XMM1  VPMOVZXBD, // XMM1[7:0] UINT8->UINT32 -> XMM2[31:0]
//                               // XMM1[15:8] UINT8->UINT32 -> XMM2[63:32]
//                               // XMM1[23:16] UINT8->UINT32 -> XMM2[95:64]
//                               // XMM1[31:24] UINT8->UINT32 -> XMM2[127:96]
//
//  XMM1  XMM8  VPMOVZXBD,       // XMM1[7:0] UINT8->UINT32 -> XMM8[31:0]
//                               // XMM1[15:8] UINT8->UINT32 -> XMM8[63:32]
//                               // XMM1[23:16] UINT8->UINT32 -> XMM8[95:64]
//                               // XMM1[31:24] UINT8->UINT32 -> XMM8[127:96]
//
//  YMM1  YMM8  VPMOVZXBD,       // YMM1[7:0] UINT8->UINT32 -> YMM8[31:0]
//                               // YMM1[15:8] UINT8->UINT32 -> YMM8[63:32]
//                               // YMM1[23:16] UINT8->UINT32 -> YMM8[95:64]
//                               // YMM1[31:24] UINT8->UINT32 -> YMM8[127:96]
//                               // ...
//                               // YMM1[63:56] UINT8->UINT32 -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpmovzxbqcomma ( PMOVZXBQ, )
//
// C prototype:
//  void dg_forthpmovzxbqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMOVZXBQ instruction. This sequence zero extends the two unsigned
//   bytes in the lower 16 bits of the source to two 64 bit unsigned
//   integers and puts them into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMOVZXBQ,     // [RBX][7:0] UINT8->UINT64 -> XMM1[63:0]
//                               // [RBX][15:8] UINT8->UINT64 -> XMM1[127:64]
//
//  XMM2  XMM1  PMOVZXBQ,        // XMM2[7:0] UINT8->UINT64 -> XMM1[63:0]
//                               // XMM2[15:8] UINT8->UINT64 -> XMM1[127:64]
//
//  XMM2 <-  XMM1  PMOVZXBQ,  // XMM1[7:0] UINT8->UINT64 -> XMM2[63:0]
//                               // XMM1[15:8] UINT8->UINT64 -> XMM2[127:64]
//
//  XMM1  XMM8  PMOVZXBQ,        // XMM1[7:0] UINT8->UINT64 -> XMM8[63:0]
//                               // XMM1[15:8] UINT8->UINT64 -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmovzxbqcomma ( VPMOVZXBQ, )
//
// C prototype:
//  void dg_forthvpmovzxbqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VPMOVZXBQ instruction. This sequence zero extends the 2 or 4
//   unsigned bytes in the lower eighth of the source to 2 or 4 64 bit unsigned
//   integers and puts them into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  VPMOVZXBQ,    // [RBX][7:0]  UINT8->UINT64 -> XMM1[63:0]
//                               // [RBX][15:8] UINT8->UINT64 -> XMM1[127:64]
//
//  XMM2  XMM1  VPMOVZXBQ,       // XMM2[7:0]  UINT8->UINT64 -> XMM1[63:0]
//                               // XMM2[15:8] UINT8->UINT64 -> XMM1[127:64]
//
//  XMM2 <-  XMM1  VPMOVZXBQ, // XMM1[7:0]  UINT8->UINT64 -> XMM2[63:0]
//                               // XMM1[15:8] UINT8->UINT64 -> XMM2[127:64]
//
//  XMM1  XMM8  VPMOVZXBQ,       // XMM1[7:0]  UINT8->UINT64 -> XMM8[63:0]
//                               // XMM1[15:8] UINT8->UINT64 -> XMM8[127:64]
//
//  YMM1  YMM8  VPMOVZXBQ,       // YMM1[7:0]   UINT8->UINT64 -> YMM8[63:0]
//                               // YMM1[15:8]  UINT8->UINT64 -> YMM8[127:64]
//                               // YMM1[23:16] UINT8->UINT64 -> YMM8[191:128]
//                               // YMM1[31:24] UINT8->UINT64 -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpmovzxwdcomma ( PMOVZXWD, )
//
// C prototype:
//  void dg_forthpmovzxwdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMOVZXWD instruction. This sequence zero extends the four unsigned
//   16 bit integers in the lower 64 bits of the source to four 32 bit unsigned
//   integers and puts them into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMOVZXWD,     // [RBX][15:0] UINT16->UINT32 -> XMM1[31:0]
//                               // [RBX][31:16] UINT16->UINT32 -> XMM1[63:32]
//                               // [RBX][47:32] UINT16->UINT32 -> XMM1[95:64]
//                               // [RBX][63:48] UINT16->UINT32 -> XMM1[127:96]
//
//  XMM2  XMM1  PMOVZXWD,        // XMM2[15:0] UINT16->UINT32 -> XMM1[31:0]
//                               // XMM2[31:16] UINT16->UINT32 -> XMM1[63:32]
//                               // XMM2[47:32] UINT16->UINT32 -> XMM1[95:64]
//                               // XMM2[63:48] UINT16->UINT32 -> XMM1[127:96]
//
//  XMM2 <-  XMM1  PMOVZXWD,  // XMM1[15:0] UINT16->UINT32 -> XMM2[31:0]
//                               // XMM1[31:16] UINT16->UINT32 -> XMM2[63:32]
//                               // XMM1[47:32] UINT16->UINT32 -> XMM2[95:64]
//                               // XMM1[63:48] UINT16->UINT32 -> XMM2[127:96]
//
//  XMM1  XMM8  PMOVZXWD,        // XMM1[15:0] UINT16->UINT32 -> XMM8[31:0]
//                               // XMM1[31:16] UINT16->UINT32 -> XMM8[63:32]
//                               // XMM1[47:32] UINT16->UINT32 -> XMM8[95:64]
//                               // XMM1[63:48] UINT16->UINT32 -> XMM8[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. If the source is not memory, then it must also be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmovzxwdcomma ( VPMOVZXWD, )
//
// C prototype:
//  void dg_forthvpmovzxwdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VPMOVZXWD instruction. This sequence zero extends the 4 or 8
//   unsigned 16 bit integers in the lower half of the source to 4 or 8 32 bit 
//   unsigned integers and puts them into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  VPMOVZXWD,    // [RBX][15:0]  UINT16->UINT32 -> XMM1[31:0]
//                               // [RBX][31:16] UINT16->UINT32 -> XMM1[63:32]
//                               // [RBX][47:32] UINT16->UINT32 -> XMM1[95:64]
//                               // [RBX][63:48] UINT16->UINT32 -> XMM1[127:96]
//
//  XMM2  XMM1  VPMOVZXWD,       // XMM2[15:0]  UINT16->UINT32 -> XMM1[31:0]
//                               // XMM2[31:16] UINT16->UINT32 -> XMM1[63:32]
//                               // XMM2[47:32] UINT16->UINT32 -> XMM1[95:64]
//                               // XMM2[63:48] UINT16->UINT32 -> XMM1[127:96]
//
//  XMM2 <-  XMM1  VPMOVZXWD, // XMM1[15:0]  UINT16->UINT32 -> XMM2[31:0]
//                               // XMM1[31:16] UINT16->UINT32 -> XMM2[63:32]
//                               // XMM1[47:32] UINT16->UINT32 -> XMM2[95:64]
//                               // XMM1[63:48] UINT16->UINT32 -> XMM2[127:96]
//
//  XMM1  XMM8  VPMOVZXWD,       // XMM1[15:0]  UINT16->UINT32 -> XMM8[31:0]
//                               // XMM1[31:16] UINT16->UINT32 -> XMM8[63:32]
//                               // XMM1[47:32] UINT16->UINT32 -> XMM8[95:64]
//                               // XMM1[63:48] UINT16->UINT32 -> XMM8[127:96]
//
//  YMM1  YMM8  VPMOVZXWD,       // YMM1[15:0]    UINT16->UINT32 -> YMM8[31:0]
//                               // YMM1[31:16]   UINT16->UINT32 -> YMM8[63:32]
//                               // YMM1[47:32]   UINT16->UINT32 -> YMM8[95:64]
//                               // YMM1[63:48]   UINT16->UINT32 -> YMM8[127:96]
//                               // ...
//                               // YMM1[127:112] UINT16->UINT32 -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpmovzxwqcomma ( PMOVZXWQ, )
//
// C prototype:
//  void dg_forthpmovzxwqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMOVZXWQ instruction. This sequence zero extends the two unsigned
//   16 bit integers in the lower 32 bits of the source to two 64 bit unsigned
//   integers and puts them into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMOVZXWQ,     // [RBX][15:0] UINT16->UINT64 -> XMM1[63:0]
//                               // [RBX][31:16] UINT16->UINT64 -> XMM1[127:64]
//
//  XMM2  XMM1  PMOVZXWQ,        // XMM2[15:0] UINT16->UINT64 -> XMM1[63:0]
//                               // XMM2[31:16] UINT16->UINT64 -> XMM1[127:64]
//
//  XMM2 <-  XMM1  PMOVZXWQ,  // XMM1[15:0] UINT16->UINT64 -> XMM2[63:0]
//                               // XMM1[31:16] UINT16->UINT64 -> XMM2[127:64]
//
//  XMM1  XMM8  PMOVZXWQ,        // XMM1[15:0] UINT16->UINT64 -> XMM8[63:0]
//                               // XMM1[31:16] UINT16->UINT64 -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. If the source is not memory, then it must also be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmovzxwqcomma ( VPMOVZXWQ, )
//
// C prototype:
//  void dg_forthvpmovzxwqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VPMOVZXWQ instruction. This sequence zero extends the 2 or 4 unsigned
//   16 bit integers in the lower quarter of the source to 2 or 4 64 bit unsigned
//   integers and puts them into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  VPMOVZXWQ,    // [RBX][15:0]  UINT16->UINT64 -> XMM1[63:0]
//                               // [RBX][31:16] UINT16->UINT64 -> XMM1[127:64]
//
//  XMM2  XMM1  VPMOVZXWQ,       // XMM2[15:0]  UINT16->UINT64 -> XMM1[63:0]
//                               // XMM2[31:16] UINT16->UINT64 -> XMM1[127:64]
//
//  XMM2 <-  XMM1  VPMOVZXWQ, // XMM1[15:0]  UINT16->UINT64 -> XMM2[63:0]
//                               // XMM1[31:16] UINT16->UINT64 -> XMM2[127:64]
//
//  XMM1  XMM8  VPMOVZXWQ,       // XMM1[15:0]  UINT16->UINT64 -> XMM8[63:0]
//                               // XMM1[31:16] UINT16->UINT64 -> XMM8[127:64]
//
//  YMM1  YMM8  VPMOVZXWQ,       // YMM1[15:0]  UINT16->UINT64 -> YMM8[63:0]
//                               // YMM1[31:16] UINT16->UINT64 -> YMM8[127:64]
//                               // YMM1[47:32] UINT16->UINT64 -> YMM8[191:128]
//                               // YMM1[63:48] UINT16->UINT64 -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpmovzxdqcomma ( PMOVZXDQ, )
//
// C prototype:
//  void dg_forthpmovzxdqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMOVZXDQ instruction. This sequence zero extends the two unsigned
//   32 bit integers in the lower 64 bits of the source to two 64 bit unsigned
//   integers and puts them into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMOVZXDQ,     // [RBX][31:0] UINT32->UINT64 -> XMM1[63:0]
//                               // [RBX][63:32] UINT32->UINT64 -> XMM1[127:64]
//
//  XMM2  XMM1  PMOVZXDQ,        // XMM2[31:0] UINT32->UINT64 -> XMM1[63:0]
//                               // XMM2[63:32] UINT32->UINT64 -> XMM1[127:64]
//
//  XMM2 <-  XMM1  PMOVZXDQ,  // XMM1[31:0] UINT32->UINT64 -> XMM2[63:0]
//                               // XMM1[63:32] UINT32->UINT64 -> XMM2[127:64]
//
//  XMM1  XMM8  PMOVZXDQ,        // XMM1[31:0] UINT32->UINT64 -> XMM8[63:0]
//                               // XMM1[63:32] UINT32->UINT64 -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. If the source is not memory, then it must also be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmovzxdqcomma ( VPMOVZXDQ, )
//
// C prototype:
//  void dg_forthvpmovzxdqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VPMOVZXDQ instruction. This sequence zero extends the 2 or 4
//   unsigned 32 bit integers in the lower half of the source to 2 or 4 64 bit 
//   unsigned integers and puts them into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  VPMOVZXDQ,    // [RBX][31:0]  UINT32->UINT64 -> XMM1[63:0]
//                               // [RBX][63:32] UINT32->UINT64 -> XMM1[127:64]
//
//  XMM2  XMM1  VPMOVZXDQ,       // XMM2[31:0]   UINT32->UINT64 -> XMM1[63:0]
//                               // XMM2[63:32]  UINT32->UINT64 -> XMM1[127:64]
//
//  XMM2 <-  XMM1  VPMOVZXDQ, // XMM1[31:0]   UINT32->UINT64 -> XMM2[63:0]
//                               // XMM1[63:32]  UINT32->UINT64 -> XMM2[127:64]
//
//  XMM1  XMM8  VPMOVZXDQ,       // XMM1[31:0]   UINT32->UINT64 -> XMM8[63:0]
//                               // XMM1[63:32]  UINT32->UINT64 -> XMM8[127:64]
//
//  YMM1  YMM8  VPMOVZXDQ,       // YMM1[31:0]   UINT32->UINT64 -> YMM8[63:0]
//                               // YMM1[63:32]  UINT32->UINT64 -> YMM8[127:64]
//                               // YMM1[95:64]  UINT32->UINT64 -> YMM8[191:128]
//                               // YMM1[127:96] UINT32->UINT64 -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpmuldqcomma ( PMULDQ, )
//
// C prototype:
//  void dg_forthpmuldqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMULDQ instruction. This sequence multiplies the signed 32 bit
//   integer in the low 32 bits of the source by the signed 32 bit integer in
//   the low 32 bits of the destination and puts the result into the lower 64
//   bits of the destination. This sequence also multiplies the signed 32 bit
//   integer in bits 95:64 of the destination by the signed 32 bit
//   integer in bits 95:64 of the source and puts the result into
//   the upper 64 bits of the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMULDQ,     // [RBX][31:0] * XMM1[31:0] -> XMM1[63:0]
//                             // [RBX][95:64] * XMM1[95:64] -> XMM1[127:64]
//
//  XMM2  XMM1  PMULDQ,        // XMM2[31:0] * XMM1[31:0] -> XMM1[63:0]
//                             // XMM2[95:64] * XMM1[95:64] -> XMM1[127:64]
//
//  XMM2 <-  XMM1  PMULDQ,  // XMM1[31:0] * XMM2[31:0] -> XMM2[63:0]
//                             // XMM1[95:64] * XMM2[95:64] -> XMM2[127:64]
//
//  XMM1  XMM8  PMULDQ,        // XMM1[31:0] * XMM8[31:0] -> XMM8[63:0]
//                             // XMM1[95:64] * XMM8[95:64] -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. If the source is not memory, then it must also be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmuldqcomma ( VPMULDQ, )
//
// C prototype:
//  void dg_forthvpmuldqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPMULDQ instruction. This sequence multiplies the signed 32 bit
//   integer in the low 32 bits of the source by the signed 32 bit integer in
//   the low 32 bits of target y and puts the result into the lower 64
//   bits of the destination. This sequence also multiplies the signed 32 bit
//   integer in bits 95:64 of target y by the signed 32 bit
//   integer in bits 95:64 of the source and puts the result into
//   the upper 64 bits of the destination. If the destination is a ymm register
//   then this opcode sequence does the same thing with the upper 128 bits of
//   target y, the source, and the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPMULDQ, // [RBX][31:0]  * XMM0[31:0]  -> XMM1[63:0]
//                                // [RBX][95:64] * XMM0[95:64] -> XMM1[127:64]
//
//  XMM2  XMM0  XMM1  VPMULDQ,  // XMM2[31:0]  * XMM0[31:0]  -> XMM1[63:0]
//                              // XMM2[95:64] * XMM0[95:64] -> XMM1[127:64]
//
//  XMM2 <-  XMM0  XMM1  VPMULDQ, 
//                             // XMM1[31:0]  * XMM0[31:0]  -> XMM2[63:0]
//                             // XMM1[95:64] * XMM0[95:64] -> XMM2[127:64]
//
//  XMM1  XMM0  XMM8  VPMULDQ, // XMM1[31:0]  * XMM0[31:0] -> XMM8[63:0]
//                             // XMM1[95:64] * XMM0[95:64] -> XMM8[127:64]
//
//  YMM1  YMM0  YMM8  VPMULDQ, // YMM1[31:0]    * YMM0[31:0]    -> YMM8[63:0]
//                             // YMM1[95:64]   * YMM0[95:64]   -> YMM8[127:64]
//                             // YMM1[159:128] * YMM0[159:128] -> YMM8[191:128]
//                             // YMM1[223:192] * YMM0[223:192] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpmulhrswcomma ( PMULHRSW, )
//
// C prototype:
//  void dg_forthpmulhrswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMULHRSW instruction. This sequence multiplies each 16 bit signed
//   integer in the source with the corresponding 16 bit signed integer in the
//   destination, rounds the result to the 16 most significant bits,
//   and puts the results into the destination. The exact calculation is this:
//   The upper 16 bits of 0.5 +
//   ((DEST[15:0] * SRC[15:0]) shifted to the right 15 bits) gets stored to the
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMULHRSW,     // 0.5 + ([RBX][15:0] * XMM1[15:0]) / (2^15)
//                                   -> XMM1[15:0]
//                               // 0.5 + ([RBX][31:16] * XMM1[31:16]) / (2^15)
//                                   -> XMM1[31:16]
//                               // 0.5 + ([RBX][47:32] * XMM1[47:32]) / (2^15)
//                                   -> XMM1[47:32]
//                               // 0.5 + ([RBX][63:48] * XMM1[63:48]) / (2^15)
//                                   -> XMM1[63:48]
//                               // 0.5 + ([RBX][79:64] * XMM1[79:64]) / (2^15)
//                                   -> XMM1[79:64]
//                               // 0.5 + ([RBX][95:80] * XMM1[95:80]) / (2^15)
//                                   -> XMM1[95:80]
//                               // 0.5 + ([RBX][111:96] * XMM1[111:96]) / (2^15)
//                                   -> XMM1[111:96]
//                               // 0.5 + ([RBX][127:112] * XMM1[127:112]) / (2^15)
//                                   -> XMM1[127:112]
//
//  RBX [R]  ST1  PMULHRSW,      // 0.5 + ([RBX][15:0] * ST1[15:0]) / (2^15)
//                                   -> ST1[15:0]
//                               // 0.5 + ([RBX][31:16] * ST1[31:16]) / (2^15)
//                                   -> ST1[31:16]
//                               // 0.5 + ([RBX][47:32] * ST1[47:32]) / (2^15)
//                                   -> ST1[47:32]
//                               // 0.5 + ([RBX][63:48] * ST1[63:48]) / (2^15)
//                                   -> ST1[63:48]
//
//  XMM2  XMM1  PMULHRSW,        // 0.5 + (XMM2[15:0] * XMM1[15:0]) / (2^15)
//                                   -> XMM1[15:0]
//                               // 0.5 + (XMM2[31:16] * XMM1[31:16]) / (2^15)
//                                   -> XMM1[31:16]
//                               // 0.5 + (XMM2[47:32] * XMM1[47:32]) / (2^15)
//                                   -> XMM1[47:32]
//                               // 0.5 + (XMM2[63:48] * XMM1[63:48]) / (2^15)
//                                   -> XMM1[63:48]
//                               // 0.5 + (XMM2[79:64] * XMM1[79:64]) / (2^15)
//                                   -> XMM1[79:64]
//                               // 0.5 + (XMM2[95:80] * XMM1[95:80]) / (2^15)
//                                   -> XMM1[95:80]
//                               // 0.5 + (XMM2[111:96] * XMM1[111:96]) / (2^15)
//                                   -> XMM1[111:96]
//                               // 0.5 + (XMM2[127:112] * XMM1[127:112]) / (2^15)
//                                   -> XMM1[127:112]
//
//  ST2  ST1  PMULHRSW,          // 0.5 + (ST2[15:0] * ST1[15:0]) / (2^15)
//                                   -> ST1[15:0]
//                               // 0.5 + (ST2[31:16] * ST1[31:16]) / (2^15)
//                                   -> ST1[31:16]
//                               // 0.5 + (ST2[47:32] * ST1[47:32]) / (2^15)
//                                   -> ST1[47:32]
//                               // 0.5 + (ST2[63:48] * ST1[63:48]) / (2^15)
//                                   -> ST1[63:48]
//
//  XMM1 <-  XMM2  PMULHRSW,  // 0.5 + (XMM2[15:0] * XMM1[15:0]) / (2^15)
//                                   -> XMM1[15:0]
//                               // 0.5 + (XMM2[31:16] * XMM1[31:16]) / (2^15)
//                                   -> XMM1[31:16]
//                               // 0.5 + (XMM2[47:32] * XMM1[47:32]) / (2^15)
//                                   -> XMM1[47:32]
//                               // 0.5 + (XMM2[63:48] * XMM1[63:48]) / (2^15)
//                                   -> XMM1[63:48]
//                               // 0.5 + (XMM2[79:64] * XMM1[79:64]) / (2^15)
//                                   -> XMM1[79:64]
//                               // 0.5 + (XMM2[95:80] * XMM1[95:80]) / (2^15)
//                                   -> XMM1[95:80]
//                               // 0.5 + (XMM2[111:96] * XMM1[111:96]) / (2^15)
//                                   -> XMM1[111:96]
//                               // 0.5 + (XMM2[127:112] * XMM1[127:112]) / (2^15)
//                                   -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmulhrswcomma ( VPMULHRSW, )
//
// C prototype:
//  void dg_forthvpmulhrswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPMULHRSW instruction. This sequence multiplies each 16 bit signed
//   integer in the source with the corresponding 16 bit signed integer in 
//   target y, rounds the result to the 16 most significant bits,
//   and puts the results into the destination. The exact calculation is this:
//   The upper 16 bits of 0.5 +
//   ((TARGETY[15:0] * SRC[15:0]) shifted to the right 15 bits) gets stored to the
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPMULHRSW,
//                               // 0.5 + ([RBX][15:0] * XMM0[15:0]) / (2^15)
//                                   -> XMM1[15:0]
//                               // 0.5 + ([RBX][31:16] * XMM0[31:16]) / (2^15)
//                                   -> XMM1[31:16]
//                               // 0.5 + ([RBX][47:32] * XMM0[47:32]) / (2^15)
//                                   -> XMM1[47:32]
//                               // 0.5 + ([RBX][63:48] * XMM0[63:48]) / (2^15)
//                                   -> XMM1[63:48]
//                               // 0.5 + ([RBX][79:64] * XMM0[79:64]) / (2^15)
//                                   -> XMM1[79:64]
//                               // 0.5 + ([RBX][95:80] * XMM0[95:80]) / (2^15)
//                                   -> XMM1[95:80]
//                               // 0.5 + ([RBX][111:96] * XMM0[111:96]) / (2^15)
//                                   -> XMM1[111:96]
//                               // 0.5 + ([RBX][127:112] * XMM0[127:112]) / (2^15)
//                                   -> XMM1[127:112]
//
//  YMM2  YMM0  YMM1  VPMULHRSW, // 0.5 + (YMM2[15:0] * YMM0[15:0]) / (2^15)
//                                   -> YMM1[15:0]
//                               // 0.5 + (YMM2[31:16] * YMM0[31:16]) / (2^15)
//                                   -> YMM1[31:16]
//                               // 0.5 + (YMM2[47:32] * YMM0[47:32]) / (2^15)
//                                   -> YMM1[47:32]
//                               // 0.5 + (YMM2[63:48] * YMM0[63:48]) / (2^15)
//                                   -> YMM1[63:48]
//                               // 0.5 + (YMM2[79:64] * YMM0[79:64]) / (2^15)
//                                   -> YMM1[79:64]
//                               // 0.5 + (YMM2[95:80] * YMM0[95:80]) / (2^15)
//                                   -> YMM1[95:80]
//                               // 0.5 + (YMM2[111:96] * YMM0[111:96]) / (2^15)
//                                   -> YMM1[111:96]
//                               // 0.5 + (YMM2[127:112] * YMM0[127:112]) / (2^15)
//                                   -> YMM1[127:112]
//                               // ... 
//                               // 0.5 + (YMM2[255:240] * YMM0[255:240]) / (2^15)
//                                   -> YMM1[255:240]
//
//  XMM1 <-  XMM0  XMM2  VPMULHRSW,
//                               // 0.5 + (XMM2[15:0] * XMM0[15:0]) / (2^15)
//                                   -> XMM1[15:0]
//                               // 0.5 + (XMM2[31:16] * XMM0[31:16]) / (2^15)
//                                   -> XMM1[31:16]
//                               // 0.5 + (XMM2[47:32] * XMM0[47:32]) / (2^15)
//                                   -> XMM1[47:32]
//                               // 0.5 + (XMM2[63:48] * XMM0[63:48]) / (2^15)
//                                   -> XMM1[63:48]
//                               // 0.5 + (XMM2[79:64] * XMM0[79:64]) / (2^15)
//                                   -> XMM1[79:64]
//                               // 0.5 + (XMM2[95:80] * XMM0[95:80]) / (2^15)
//                                   -> XMM1[95:80]
//                               // 0.5 + (XMM2[111:96] * XMM0[111:96]) / (2^15)
//                                   -> XMM1[111:96]
//                               // 0.5 + (XMM2[127:112] * XMM0[127:112]) / (2^15)
//                                   -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpmulhuwcomma ( PMULHUW, )
//
// C prototype:
//  void dg_forthpmulhuwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMULHUW instruction. This sequence multiplies each 16 bit unsigned
//   integer in the source with the corresponding 16 bit unsigned integer in the
//   destination and puts the upper 16 bits of each result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMULHUW,      // ([RBX][15:0] * XMM1[15:0]) / (2^16)
//                                   -> XMM1[15:0]
//                               // ([RBX][31:16] * XMM1[31:16]) / (2^16)
//                                   -> XMM1[31:16]
//                               // ([RBX][47:32] * XMM1[47:32]) / (2^16)
//                                   -> XMM1[47:32]
//                               // ([RBX][63:48] * XMM1[63:48]) / (2^16)
//                                   -> XMM1[63:48]
//                               // ([RBX][79:64] * XMM1[79:64]) / (2^16)
//                                   -> XMM1[79:64]
//                               // ([RBX][95:80] * XMM1[95:80]) / (2^16)
//                                   -> XMM1[95:80]
//                               // ([RBX][111:96] * XMM1[111:96]) / (2^16)
//                                   -> XMM1[111:96]
//                               // ([RBX][127:112] * XMM1[127:112]) / (2^16)
//                                   -> XMM1[127:112]
//
//  RBX [R]  ST1  PMULHUW,       // ([RBX][15:0] * ST1[15:0]) / (2^16)
//                                   -> ST1[15:0]
//                               // ([RBX][31:16] * ST1[31:16]) / (2^16)
//                                   -> ST1[31:16]
//                               // ([RBX][47:32] * ST1[47:32]) / (2^16)
//                                   -> ST1[47:32]
//                               // ([RBX][63:48] * ST1[63:48]) / (2^16)
//                                   -> ST1[63:48]
//
//  XMM2  XMM1  PMULHUW,         // (XMM2[15:0] * XMM1[15:0]) / (2^16)
//                                   -> XMM1[15:0]
//                               // (XMM2[31:16] * XMM1[31:16]) / (2^16)
//                                   -> XMM1[31:16]
//                               // (XMM2[47:32] * XMM1[47:32]) / (2^16)
//                                   -> XMM1[47:32]
//                               // (XMM2[63:48] * XMM1[63:48]) / (2^16)
//                                   -> XMM1[63:48]
//                               // (XMM2[79:64] * XMM1[79:64]) / (2^16)
//                                   -> XMM1[79:64]
//                               // (XMM2[95:80] * XMM1[95:80]) / (2^16)
//                                   -> XMM1[95:80]
//                               // (XMM2[111:96] * XMM1[111:96]) / (2^16)
//                                   -> XMM1[111:96]
//                               // (XMM2[127:112] * XMM1[127:112]) / (2^16)
//                                   -> XMM1[127:112]
//
//  ST2  ST1  PMULHUW,           // (ST2[15:0] * ST1[15:0]) / (2^16)
//                                   -> ST1[15:0]
//                               // (ST2[31:16] * ST1[31:16]) / (2^16)
//                                   -> ST1[31:16]
//                               // (ST2[47:32] * ST1[47:32]) / (2^16)
//                                   -> ST1[47:32]
//                               // (ST2[63:48] * ST1[63:48]) / (2^16)
//                                   -> ST1[63:48]
//
//  XMM1 <-  XMM2  PMULHUW,   // (XMM2[15:0] * XMM1[15:0]) / (2^16)
//                                   -> XMM1[15:0]
//                               // (XMM2[31:16] * XMM1[31:16]) / (2^16)
//                                   -> XMM1[31:16]
//                               // (XMM2[47:32] * XMM1[47:32]) / (2^16)
//                                   -> XMM1[47:32]
//                               // (XMM2[63:48] * XMM1[63:48]) / (2^16)
//                                   -> XMM1[63:48]
//                               // (XMM2[79:64] * XMM1[79:64]) / (2^16)
//                                   -> XMM1[79:64]
//                               // (XMM2[95:80] * XMM1[95:80]) / (2^16)
//                                   -> XMM1[95:80]
//                               // (XMM2[111:96] * XMM1[111:96]) / (2^16)
//                                   -> XMM1[111:96]
//                               // (XMM2[127:112] * XMM1[127:112]) / (2^16)
//                                   -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmulhuwcomma ( VPMULHUW, )
//
// C prototype:
//  void dg_forthvpmulhuwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing
//   mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPMULHUW instruction. This sequence multiplies each 16 bit unsigned
//   integer in the source with the corresponding 16 bit unsigned integer in
//   target y and puts the upper 16 bits of each result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPMULHUW,      
//                               // ([RBX][15:0] * XMM0[15:0]) / (2^16)
//                                   -> XMM1[15:0]
//                               // ([RBX][31:16] * XMM0[31:16]) / (2^16)
//                                   -> XMM1[31:16]
//                               // ([RBX][47:32] * XMM0[47:32]) / (2^16)
//                                   -> XMM1[47:32]
//                               // ([RBX][63:48] * XMM0[63:48]) / (2^16)
//                                   -> XMM1[63:48]
//                               // ([RBX][79:64] * XMM0[79:64]) / (2^16)
//                                   -> XMM1[79:64]
//                               // ([RBX][95:80] * XMM0[95:80]) / (2^16)
//                                   -> XMM1[95:80]
//                               // ([RBX][111:96] * XMM0[111:96]) / (2^16)
//                                   -> XMM1[111:96]
//                               // ([RBX][127:112] * XMM0[127:112]) / (2^16)
//                                   -> XMM1[127:112]
//
//  YMM2  YMM0  YMM1  VPMULHUW,  // (YMM2[15:0] * YMM0[15:0]) / (2^16)
//                                   -> YMM1[15:0]
//                               // (YMM2[31:16] * YMM0[31:16]) / (2^16)
//                                   -> YMM1[31:16]
//                               // (YMM2[47:32] * YMM0[47:32]) / (2^16)
//                                   -> YMM1[47:32]
//                               // (YMM2[63:48] * YMM0[63:48]) / (2^16)
//                                   -> YMM1[63:48]
//                               // (YMM2[79:64] * YMM0[79:64]) / (2^16)
//                                   -> YMM1[79:64]
//                               // (YMM2[95:80] * YMM0[95:80]) / (2^16)
//                                   -> YMM1[95:80]
//                               // (YMM2[111:96] * YMM0[111:96]) / (2^16)
//                                   -> YMM1[111:96]
//                               // (YMM2[127:112] * YMM0[127:112]) / (2^16)
//                                   -> YMM1[127:112]
//                               // ...
//                               // (YMM2[255:240] * YMM0[255:240]) / (2^16)
//                                   -> YMM1[255:240]
//
//  XMM1 <-  XMM0 XMM2  VPMULHUW,   
//                               // (XMM2[15:0] * XMM0[15:0]) / (2^16)
//                                   -> XMM1[15:0]
//                               // (XMM2[31:16] * XMM0[31:16]) / (2^16)
//                                   -> XMM1[31:16]
//                               // (XMM2[47:32] * XMM0[47:32]) / (2^16)
//                                   -> XMM1[47:32]
//                               // (XMM2[63:48] * XMM0[63:48]) / (2^16)
//                                   -> XMM1[63:48]
//                               // (XMM2[79:64] * XMM0[79:64]) / (2^16)
//                                   -> XMM1[79:64]
//                               // (XMM2[95:80] * XMM0[95:80]) / (2^16)
//                                   -> XMM1[95:80]
//                               // (XMM2[111:96] * XMM0[111:96]) / (2^16)
//                                   -> XMM1[111:96]
//                               // (XMM2[127:112] * XMM0[127:112]) / (2^16)
//                                   -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpmulhwcomma ( PMULHW, )
//
// C prototype:
//  void dg_forthpmulhwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMULHW instruction. This sequence multiplies each 16 bit signed
//   integer in the source with the corresponding 16 bit signed integer in the
//   destination and puts the upper 16 bits of each result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMULHW,      // ([RBX][15:0] * XMM1[15:0]) / (2^16)
//                                   -> XMM1[15:0]
//                              // ([RBX][31:16] * XMM1[31:16]) / (2^16)
//                                   -> XMM1[31:16]
//                              // ([RBX][47:32] * XMM1[47:32]) / (2^16)
//                                   -> XMM1[47:32]
//                              // ([RBX][63:48] * XMM1[63:48]) / (2^16)
//                                   -> XMM1[63:48]
//                              // ([RBX][79:64] * XMM1[79:64]) / (2^16)
//                                   -> XMM1[79:64]
//                              // ([RBX][95:80] * XMM1[95:80]) / (2^16)
//                                   -> XMM1[95:80]
//                              // ([RBX][111:96] * XMM1[111:96]) / (2^16)
//                                   -> XMM1[111:96]
//                              // ([RBX][127:112] * XMM1[127:112]) / (2^16)
//                                   -> XMM1[127:112]
//
//  RBX [R]  ST1  PMULHW,      // ([RBX][15:0] * ST1[15:0]) / (2^16)
//                                   -> ST1[15:0]
//                              // ([RBX][31:16] * ST1[31:16]) / (2^16)
//                                   -> ST1[31:16]
//                              // ([RBX][47:32] * ST1[47:32]) / (2^16)
//                                   -> ST1[47:32]
//                              // ([RBX][63:48] * ST1[63:48]) / (2^16)
//                                   -> ST1[63:48]
//
//  XMM2  XMM1  PMULHW,         // (XMM2[15:0] * XMM1[15:0]) / (2^16)
//                                   -> XMM1[15:0]
//                              // (XMM2[31:16] * XMM1[31:16]) / (2^16)
//                                   -> XMM1[31:16]
//                              // (XMM2[47:32] * XMM1[47:32]) / (2^16)
//                                   -> XMM1[47:32]
//                              // (XMM2[63:48] * XMM1[63:48]) / (2^16)
//                                   -> XMM1[63:48]
//                              // (XMM2[79:64] * XMM1[79:64]) / (2^16)
//                                   -> XMM1[79:64]
//                              // (XMM2[95:80] * XMM1[95:80]) / (2^16)
//                                   -> XMM1[95:80]
//                              // (XMM2[111:96] * XMM1[111:96]) / (2^16)
//                                   -> XMM1[111:96]
//                              // (XMM2[127:112] * XMM1[127:112]) / (2^16)
//                                   -> XMM1[127:112]
//
//  ST2  ST1  PMULHW,           // (ST2[15:0] * ST1[15:0]) / (2^16)
//                                   -> ST1[15:0]
//                              // (ST2[31:16] * ST1[31:16]) / (2^16)
//                                   -> ST1[31:16]
//                              // (ST2[47:32] * ST1[47:32]) / (2^16)
//                                   -> ST1[47:32]
//                              // (ST2[63:48] * ST1[63:48]) / (2^16)
//                                   -> ST1[63:48]
//
//  XMM1 <-  XMM2  PMULHW,   // (XMM2[15:0] * XMM1[15:0]) / (2^16)
//                                   -> XMM1[15:0]
//                              // (XMM2[31:16] * XMM1[31:16]) / (2^16)
//                                   -> XMM1[31:16]
//                              // (XMM2[47:32] * XMM1[47:32]) / (2^16)
//                                   -> XMM1[47:32]
//                              // (XMM2[63:48] * XMM1[63:48]) / (2^16)
//                                   -> XMM1[63:48]
//                              // (XMM2[79:64] * XMM1[79:64]) / (2^16)
//                                   -> XMM1[79:64]
//                              // (XMM2[95:80] * XMM1[95:80]) / (2^16)
//                                   -> XMM1[95:80]
//                              // (XMM2[111:96] * XMM1[111:96]) / (2^16)
//                                   -> XMM1[111:96]
//                              // (XMM2[127:112] * XMM1[127:112]) / (2^16)
//                                   -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmulhwcomma ( VPMULHW, )
//
// C prototype:
//  void dg_forthvpmulhwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing
//   mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPMULHW instruction. This sequence multiplies each 16 bit signed
//   integer in the source with the corresponding 16 bit signed integer in
//   target y and puts the upper 16 bits of each result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPMULHW,     
//                              // ([RBX][15:0] * XMM0[15:0]) / (2^16)
//                                   -> XMM1[15:0]
//                              // ([RBX][31:16] * XMM0[31:16]) / (2^16)
//                                   -> XMM1[31:16]
//                              // ([RBX][47:32] * XMM0[47:32]) / (2^16)
//                                   -> XMM1[47:32]
//                              // ([RBX][63:48] * XMM0[63:48]) / (2^16)
//                                   -> XMM1[63:48]
//                              // ([RBX][79:64] * XMM0[79:64]) / (2^16)
//                                   -> XMM1[79:64]
//                              // ([RBX][95:80] * XMM0[95:80]) / (2^16)
//                                   -> XMM1[95:80]
//                              // ([RBX][111:96] * XMM0[111:96]) / (2^16)
//                                   -> XMM1[111:96]
//                              // ([RBX][127:112] * XMM0[127:112]) / (2^16)
//                                   -> XMM1[127:112]
//
//
//  YMM2  YMM0  YMM1  VPMULHW,        
//                              // (YMM2[15:0] * YMM0[15:0]) / (2^16)
//                                   -> YMM1[15:0]
//                              // (YMM2[31:16] * YMM0[31:16]) / (2^16)
//                                   -> YMM1[31:16]
//                              // (YMM2[47:32] * YMM0[47:32]) / (2^16)
//                                   -> YMM1[47:32]
//                              // (YMM2[63:48] * YMM0[63:48]) / (2^16)
//                                   -> YMM1[63:48]
//                              // (YMM2[79:64] * YMM0[79:64]) / (2^16)
//                                   -> YMM1[79:64]
//                              // (YMM2[95:80] * YMM0[95:80]) / (2^16)
//                                   -> YMM1[95:80]
//                              // (YMM2[111:96] * YMM0[111:96]) / (2^16)
//                                   -> YMM1[111:96]
//                              // (YMM2[127:112] * YMM0[127:112]) / (2^16)
//                                   -> YMM1[127:112]
//                              // ...
//                              // (YMM2[255:240] * YMM0[255:240]) / (2^16)
//                                   -> YMM1[255:240]
//
//  XMM1 <-  XMM0  XMM2  VPMULHW,  
//                              // (XMM2[15:0] * XMM0[15:0]) / (2^16)
//                                   -> XMM1[15:0]
//                              // (XMM2[31:16] * XMM0[31:16]) / (2^16)
//                                   -> XMM1[31:16]
//                              // (XMM2[47:32] * XMM0[47:32]) / (2^16)
//                                   -> XMM1[47:32]
//                              // (XMM2[63:48] * XMM0[63:48]) / (2^16)
//                                   -> XMM1[63:48]
//                              // (XMM2[79:64] * XMM0[79:64]) / (2^16)
//                                   -> XMM1[79:64]
//                              // (XMM2[95:80] * XMM0[95:80]) / (2^16)
//                                   -> XMM1[95:80]
//                              // (XMM2[111:96] * XMM0[111:96]) / (2^16)
//                                   -> XMM1[111:96]
//                              // (XMM2[127:112] * XMM0[127:112]) / (2^16)
//                                   -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpmulldcomma ( PMULLD, )
//
// C prototype:
//  void dg_forthpmulldcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMULLD instruction. This sequence multiplies each 32 bit signed
//   integer in the source with the corresponding 32 bit signed integer in the
//   destination and puts the lower 32 bits of each result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMULLD,     // ([RBX][31:0] * XMM1[31:0]) -> XMM1[31:0]
//                             // ([RBX][63:32] * XMM1[63:32]) -> XMM1[63:32]
//                             // ([RBX][95:64] * XMM1[95:64]) -> XMM1[95:64]
//                             // ([RBX][127:96] * XMM1[127:96]) -> XMM1[127:96]
//
//  RBX [R]  ST1  PMULLD,      // ([RBX][31:0] * ST1[31:0]) -> ST1[31:0]
//                             // ([RBX][63:32] * ST1[63:32]) -> ST1[63:32]
//
//  XMM2  XMM1  PMULLD,        // (XMM2[31:0] * XMM1[31:0]) -> XMM1[31:0]
//                             // (XMM2[63:32] * XMM1[63:32]) -> XMM1[63:32]
//                             // (XMM2[95:64] * XMM1[95:64]) -> XMM1[95:64]
//                             // (XMM2[127:96] * XMM1[127:96]) -> XMM1[127:96]
//
//  ST2  ST1  PMULLD,          // (ST2[31:0] * ST1[31:0]) -> ST1[31:0]
//                             // (ST2[63:32] * ST1[63:32]) -> ST1[63:32]
//
//  XMM1 <-  XMM2  PMULLD,  // (XMM2[31:0] * XMM1[31:0]) -> XMM1[31:0]
//                             // (XMM2[63:32] * XMM1[63:32]) -> XMM1[63:32]
//                             // (XMM2[95:64] * XMM1[95:64]) -> XMM1[95:64]
//                             // (XMM2[127:96] * XMM1[127:96]) -> XMM1[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  The failure cases have not been checked thoroughly, so
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmulldcomma ( VPMULLD, )
//
// C prototype:
//  void dg_forthvpmulldcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPMULLD instruction. This sequence multiplies each 32 bit signed
//   integer in the source with the corresponding 32 bit signed integer in
//   target y and puts the lower 32 bits of each result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPMULLD,     
//                             // ([RBX][31:0] * XMM0[31:0]) -> XMM1[31:0]
//                             // ([RBX][63:32] * XMM0[63:32]) -> XMM1[63:32]
//                             // ([RBX][95:64] * XMM0[95:64]) -> XMM1[95:64]
//                             // ([RBX][127:96] * XMM0[127:96]) -> XMM1[127:96]
//
//  YMM2  YMM0  YMM1  VPMULLD, // (YMM2[31:0]   * YMM0[31:0])   -> YMM1[31:0]
//                             // (YMM2[63:32]  * YMM0[63:32])  -> YMM1[63:32]
//                             // (YMM2[95:64]  * YMM0[95:64])  -> YMM1[95:64]
//                             // (YMM2[127:96] * YMM0[127:96]) -> YMM1[127:96]
//                             // ...
//                             // (YMM2[255:224] * YMM0[255:224]) -> YMM1[255:224]
//
//  XMM1 <-  XMM0  XMM2  VPMULLD,  
//                             // (XMM2[31:0] * XMM0[31:0]) -> XMM1[31:0]
//                             // (XMM2[63:32] * XMM0[63:32]) -> XMM1[63:32]
//                             // (XMM2[95:64] * XMM0[95:64]) -> XMM1[95:64]
//                             // (XMM2[127:96] * XMM0[127:96]) -> XMM1[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  The failure cases have not been checked thoroughly, so
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpmullwcomma ( PMULLW, )
//
// C prototype:
//  void dg_forthpmullwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMULLW instruction. This sequence multiplies each 16 bit signed
//   integer in the source with the corresponding 16 bit signed integer in the
//   destination and puts the lower 16 bits of each result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMULLW,     // ([RBX][15:0] * XMM1[15:0]) -> XMM1[15:0]
//                             // ([RBX][31:16] * XMM1[31:16]) -> XMM1[31:16]
//                             // ([RBX][47:32] * XMM1[47:32]) -> XMM1[47:32]
//                             // ([RBX][63:48] * XMM1[63:48]) -> XMM1[63:48]
//                             // ([RBX][79:64] * XMM1[79:64]) -> XMM1[79:64]
//                             // ([RBX][95:80] * XMM1[95:80]) -> XMM1[95:80]
//                             // ([RBX][111:96] * XMM1[111:96]) -> XMM1[111:96]
//                             // ([RBX][127:112] * XMM1[127:112]) -> XMM1[127:112]
//
//  RBX [R]  ST1  PMULLW,      // ([RBX][15:0] * ST1[15:0]) -> ST1[15:0]
//                             // ([RBX][31:16] * ST1[31:16]) -> ST1[31:16]
//                             // ([RBX][47:32] * ST1[47:32]) -> ST1[47:32]
//                             // ([RBX][63:48] * ST1[63:48]) -> ST1[63:48]
//
//  XMM2  XMM1  PMULLW,        // (XMM2[15:0] * XMM1[15:0]) -> XMM1[15:0]
//                             // (XMM2[31:16] * XMM1[31:16]) -> XMM1[31:16]
//                             // (XMM2[47:32] * XMM1[47:32]) -> XMM1[47:32]
//                             // (XMM2[63:48] * XMM1[63:48]) -> XMM1[63:48]
//                             // (XMM2[79:64] * XMM1[79:64]) -> XMM1[79:64]
//                             // (XMM2[95:80] * XMM1[95:80]) -> XMM1[95:80]
//                             // (XMM2[111:96] * XMM1[111:96]) -> XMM1[111:96]
//                             // (XMM2[127:112] * XMM1[127:112]) -> XMM1[127:112]
//
//  ST2  ST1  PMULLW,          // (ST2[15:0] * ST1[15:0]) -> ST1[15:0]
//                             // (ST2[31:16] * ST1[31:16]) -> ST1[31:16]
//                             // (ST2[47:32] * ST1[47:32]) -> ST1[47:32]
//                             // (ST2[63:48] * ST1[63:48]) -> ST1[63:48]
//
//  XMM1 <-  XMM2  PMULLW,  // (XMM2[15:0] * XMM1[15:0]) -> XMM1[15:0]
//                             // (XMM2[31:16] * XMM1[31:16]) -> XMM1[31:16]
//                             // (XMM2[47:32] * XMM1[47:32]) -> XMM1[47:32]
//                             // (XMM2[63:48] * XMM1[63:48]) -> XMM1[63:48]
//                             // (XMM2[79:64] * XMM1[79:64]) -> XMM1[79:64]
//                             // (XMM2[95:80] * XMM1[95:80]) -> XMM1[95:80]
//                             // (XMM2[111:96] * XMM1[111:96]) -> XMM1[111:96]
//                             // (XMM2[127:112] * XMM1[127:112]) -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  The failure cases have not been checked thoroughly, so
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmullwcomma ( VPMULLW, )
//
// C prototype:
//  void dg_forthvpmullwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPMULLW instruction. This sequence multiplies each 16 bit signed
//   integer in the source with the corresponding 16 bit signed integer in
//   target y and puts the lower 16 bits of each result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPMULLW,     
//                             // ([RBX][15:0] * XMM0[15:0]) -> XMM1[15:0]
//                             // ([RBX][31:16] * XMM0[31:16]) -> XMM1[31:16]
//                             // ([RBX][47:32] * XMM0[47:32]) -> XMM1[47:32]
//                             // ([RBX][63:48] * XMM0[63:48]) -> XMM1[63:48]
//                             // ([RBX][79:64] * XMM0[79:64]) -> XMM1[79:64]
//                             // ([RBX][95:80] * XMM0[95:80]) -> XMM1[95:80]
//                             // ([RBX][111:96] * XMM0[111:96]) -> XMM1[111:96]
//                             // ([RBX][127:112] * XMM0[127:112]) -> XMM1[127:112]
//
//  YMM2  YMM0  YMM1  VPMULLW, // (YMM2[15:0] * YMM0[15:0]) -> YMM1[15:0]
//                             // (YMM2[31:16] * YMM0[31:16]) -> YMM1[31:16]
//                             // (YMM2[47:32] * YMM0[47:32]) -> YMM1[47:32]
//                             // (YMM2[63:48] * YMM0[63:48]) -> YMM1[63:48]
//                             // (YMM2[79:64] * YMM0[79:64]) -> YMM1[79:64]
//                             // (YMM2[95:80] * YMM0[95:80]) -> YMM1[95:80]
//                             // (YMM2[111:96] * YMM0[111:96]) -> YMM1[111:96]
//                             // (YMM2[127:112] * YMM0[127:112]) -> YMM1[127:112]
//                             // ...
//                             // (YMM2[255:240] * YMM0[255:240]) -> YMM1[255:240]
//
//  XMM1 <-  XMM0  XMM2  VPMULLW,  
//                             // (XMM2[15:0] * XMM0[15:0]) -> XMM1[15:0]
//                             // (XMM2[31:16] * XMM0[31:16]) -> XMM1[31:16]
//                             // (XMM2[47:32] * XMM0[47:32]) -> XMM1[47:32]
//                             // (XMM2[63:48] * XMM0[63:48]) -> XMM1[63:48]
//                             // (XMM2[79:64] * XMM0[79:64]) -> XMM1[79:64]
//                             // (XMM2[95:80] * XMM0[95:80]) -> XMM1[95:80]
//                             // (XMM2[111:96] * XMM0[111:96]) -> XMM1[111:96]
//                             // (XMM2[127:112] * XMM0[127:112]) -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  The failure cases have not been checked thoroughly, so
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpmuludqcomma ( PMULUDQ, )
//
// C prototype:
//  void dg_forthpmuludqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMULUDQ instruction. 
//   This sequence multiplies each 32 bit unsigned integer in the lower 32 bits 
//   of each 64 bit section of the source by each 32 bit unsigned integer in the 
//   lower 32 bits of each 64 bit section of target y and puts the 64 bit results 
//   into each 64 bit section of the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMULUDQ,    // ([RBX][31:0] * XMM1[31:0]) -> XMM1[63:0]
//                             // ([RBX][95:64] * XMM1[95:64]) -> XMM1[127:64]
//
//  RBX [R]  ST1  PMULUDQ,     // ([RBX][31:0] * ST1[31:0]) -> ST1[63:0]
//
//  XMM2  XMM1  PMULUDQ,       // (XMM2[31:0] * XMM1[31:0]) -> XMM1[63:0]
//                             // (XMM2[95:64] * XMM1[95:64]) -> XMM1[127:64]
//
//  ST2  ST1  PMULUDQ,         // (ST2[31:0] * ST1[31:0]) -> ST1[63:0]
//
//  XMM1 <-  XMM2  PMULUDQ, // (XMM2[31:0] * XMM1[31:0]) -> XMM1[63:0]
//                             // (XMM2[95:64] * XMM1[95:64]) -> XMM1[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmuludqcomma ( VPMULUDQ, )
//
// C prototype:
//  void dg_forthvpmuludqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPMULUDQ instruction. 
//   This sequence multiplies each 32 bit unsigned integer in the lower 32 bits 
//   of each 64 bit section of the source by each 32 bit unsigned integer in the 
//   lower 32 bits of each 64 bit section of target y and puts the 64 bit results 
//   into each 64 bit section of the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPMULUDQ,    
//                             // ([RBX][31:0] * XMM0[31:0]) -> XMM1[63:0]
//                             // ([RBX][95:64] * XMM0[95:64]) -> XMM1[127:64]
//
//  YMM2  YMM0  YMM1  VPMULUDQ,       
//                             // (YMM2[31:0] * YMM0[31:0]) -> YMM1[63:0]
//                             // (YMM2[95:64] * YMM0[95:64]) -> YMM1[127:64]
//                             // (YMM2[159:128] * YMM0[159:128]) -> YMM1[191:128]
//                             // (YMM2[223:192] * YMM0[223:192]) -> YMM1[255:192]
//
//  XMM1 <-  XMM0  XMM2  VPMULUDQ, 
//                             // (XMM2[31:0] * XMM0[31:0]) -> XMM1[63:0]
//                             // (XMM2[95:64] * XMM0[95:64]) -> XMM1[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthporcomma ( POR, )
//
// C prototype:
//  void dg_forthporcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 POR instruction. This sequence does a bitwise or of the two targets
//   and puts the result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  POR,    // [RBX][127:0] or XMM1[127:0] -> XMM1[127:0]
//
//  RBX [R]  ST1  POR,     // [RBX][63:0] or ST1[63:0] -> ST1[63:0]
//
//  XMM2  XMM1  POR,       // XMM2[127:0] or XMM1[127:0] -> XMM1[127:0]
//
//  ST2  ST1  POR,         // ST2[63:0] or ST1[63:0] -> ST1[63:0]
//
//  XMM1 <-  XMM2  POR, // XMM2[127:0] or XMM1[127:0] -> XMM1[127:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvporcomma ( VPOR, )
//
// C prototype:
//  void dg_forthvporcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPOR instruction. This sequence does a bitwise or of target y
//   with the source target and puts the result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPOR,    // [RBX][127:0] or XMM0[127:0] -> XMM1[127:0]
//
//  YMM2  YMM0  YMM1  VPOR,       // YMM2[255:0] or YMM0[255:0] -> YMM1[255:0]
//
//  XMM1 <-  XMM0  XMM2  VPOR, // XMM2[127:0] or XMM0[127:0] -> XMM1[127:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpshufbcomma ( PSHUFB, )
//
// C prototype:
//  void dg_forthpshufbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PSHUFB instruction. This sequence either copies bytes from the
//   destination back to the destination or clears bytes in the destination
//   based on the value of bytes in the source. If the high bit of a byte in
//   the source is set, the corresponding byte in the destination is cleared.
//   If the high bit of a byte in the source is clear, the lower 3 (fpreg) or
//   4 bits (xmmreg) in each byte of the source is an index representing which
//   destination byte is copied back to the corresponding destination byte.
//   For an example using an XMM register as the destination:
//    if      dest = 0xFFEEDDCCBBAA99887766554433221100,
//    and   source = 0x0E020203030404800102030404058007 then
//    dest will have 0xEE222233334444001122334444550077 after.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PSHUFB,    // if [RBX][7] is 1 then 0 -> XMM1[7:0]
//                            // if [RBX][7] is 0 then i0 = [RBX][3:0]
//                            //  XMM1[8*i0 + 7: 8*i0] -> XMM1[7:0]
//                            // if [RBX][15] is 1 then 0 -> XMM1[15:8]
//                            // if [RBX][15] is 0 then i1 = [RBX][11:8]
//                            //  XMM1[8*i1 + 7: 8*i1] -> XMM1[15:8]
//                            // ...
//                            // if [RBX][127] is 1 then 0 -> XMM1[127:120]
//                            // if [RBX][127] is 0 then i15 = [RBX][123:120]
//                            //  XMM1[8*i15 + 7: 8*i15] -> XMM1[127:120]
//
//  RBX [R]  ST1  PSHUFB,     // if [RBX][7] is 1 then 0 -> ST1[7:0]
//                            // if [RBX][7] is 0 then i0 = [RBX][2:0]
//                            //  ST1[8*i0 + 7: 8*i0] -> ST1[7:0]
//                            // if [RBX][15] is 1 then 0 -> ST1[15:8]
//                            // if [RBX][15] is 0 then i1 = [RBX][10:8]
//                            //  ST1[8*i1 + 7: 8*i1] -> ST1[15:8]
//                            // ...
//                            // if [RBX][63] is 1 then 0 -> ST1[63:56]
//                            // if [RBX][63] is 0 then i7 = [RBX][58:56]
//                            //  ST1[8*i7 + 7: 8*i7] -> ST1[63:56]
//
//  XMM2  XMM1  PSHUFB,       // if XMM2[7] is 1 then 0 -> XMM1[7:0]
//                            // if XMM2[7] is 0 then i0 = XMM2[3:0]
//                            //  XMM1[8*i0 + 7: 8*i0] -> XMM1[7:0]
//                            // if XMM2[15] is 1 then 0 -> XMM1[15:8]
//                            // if XMM2[15] is 0 then i1 = XMM2[11:8]
//                            //  XMM1[8*i1 + 7: 8*i1] -> XMM1[15:8]
//                            // etc.
//
//  ST2  ST1  PSHUFB,         // if ST2[7] is 1 then 0 -> ST1[7:0]
//                            // if ST2[7] is 0 then i0 = ST2[2:0]
//                            //  ST1[8*i0 + 7: 8*i0] -> ST1[7:0]
//                            // if ST2[15] is 1 then 0 -> ST1[15:8]
//                            // if ST2[15] is 0 then i1 = ST2[10:8]
//                            //  ST1[8*i1 + 7: 8*i1] -> ST1[15:8]
//                            // etc.
//
//  XMM1 <-  XMM2  PSHUFB, // if XMM2[7] is 1 then 0 -> XMM1[7:0]
//                            // if XMM2[7] is 0 then i0 = XMM2[3:0]
//                            //  XMM1[8*i0 + 7: 8*i0] -> XMM1[7:0]
//                            // if XMM2[15] is 1 then 0 -> XMM1[15:8]
//                            // if XMM2[15] is 0 then i1 = XMM2[11:8]
//                            //  XMM1[8*i1 + 7: 8*i1] -> XMM1[15:8]
//                            // etc.
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpshufbcomma ( VPSHUFB, )
//
// C prototype:
//  void dg_forthvpshufbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSHUFB instruction. This sequence either copies bytes from target y
//   back to the destination or clears bytes in the destination
//   based on the value of bytes in the source. If the high bit of a byte in
//   the source is set, the corresponding byte in the destination is cleared.
//   If the high bit of a byte in the source is clear, the lower 4 bits in each 
//   byte of the source is an index representing which byte in each 128 bit
//   section of target y is copied back to the corresponding destination byte.
//   The bytes from the lower 128 bit section of target y are copied to the
//   lower 128 bit section of the destination. The bytes from the upper 128 bit
//   section of target y are copied to the upper 128 bit section of the
//   destination.
//   For an example using an XMM register as the destination:
//    if    target y = 0xFFEEDDCCBBAA99887766554433221100,
//    and   source =   0x0E020203030404800102030404058007 then
//    dest will have   0xEE222233334444001122334444550077 after.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPSHUFB,    
//                            // if [RBX][7] is 1 then 0 -> XMM1[7:0]
//                            // if [RBX][7] is 0 then i0 = [RBX][3:0]
//                            //  XMM0[8*i0 + 7: 8*i0] -> XMM1[7:0]
//                            // if [RBX][15] is 1 then 0 -> XMM1[15:8]
//                            // if [RBX][15] is 0 then i1 = [RBX][11:8]
//                            //  XMM0[8*i1 + 7: 8*i1] -> XMM1[15:8]
//                            // ...
//                            // if [RBX][127] is 1 then 0 -> XMM1[127:120]
//                            // if [RBX][127] is 0 then i15 = [RBX][123:120]
//                            //  XMM0[8*i15 + 7: 8*i15] -> XMM1[127:120]
//
//  YMM2  YMM0  YMM1  VPSHUFB,       
//                            // if YMM2[7] is 1 then 0 -> YMM1[7:0]
//                            // if YMM2[7] is 0 then i0 = YMM2[3:0]
//                            //  YMM0[8*i0 + 7: 8*i0] -> YMM1[7:0]
//                            // if YMM2[15] is 1 then 0 -> YMM1[15:8]
//                            // if YMM2[15] is 0 then i1 = YMM2[11:8]
//                            //  YMM0[8*i1 + 7: 8*i1] -> YMM1[15:8]
//                            // ...
//                            // if YMM2[255] is 1 then 0 -> YMM1[255:248]
//                            // if YMM2[255] is 0 then i31 = YMM2[251:248]
//                            //  YMM0[8*i31 + 135: 8*i31 + 128]
//                            //   -> YMM1[255:248]
//
//  XMM1 <-  XMM0  XMM2  VPSHUFB, 
//                            // if XMM2[7] is 1 then 0 -> XMM1[7:0]
//                            // if XMM2[7] is 0 then i0 = XMM2[3:0]
//                            //  XMM0[8*i0 + 7: 8*i0] -> XMM1[7:0]
//                            // if XMM2[15] is 1 then 0 -> XMM1[15:8]
//                            // if XMM2[15] is 0 then i1 = XMM2[11:8]
//                            //  XMM0[8*i1 + 7: 8*i1] -> XMM1[15:8]
//                            // etc.
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpsignbcomma ( PSIGNB, )
//
// C prototype:
//  void dg_forthpsignbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PSIGNB instruction. The sequence negates, clears, or leaves alone
//   the signed byte values in the destination based on the signed byte values
//   in the source. If the high bit in source's value is 1, then the destination
//   value is negated. If the source value is zero, the destination value is
//   cleared. Otherwise the destination value is unchanged.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PSIGNB,    // if [RBX][7] is 1 then
//                            //  0 - XMM1[7:0] -> XMM1[7:0]
//                            // if [RBX][7:0] is 0 then 0 -> XMM1[7:0]
//                            // if [RBX][15] is 1 then
//                            //  0 - XMM1[15:8] -> XMM1[15:8]
//                            // if [RBX][15:8] is 0 then 0 -> XMM1[15:8]
//                            // ...
//                            // if [RBX][127] is 1 then
//                            //  0 - XMM1[127:120] -> XMM1[127:120]
//                            // if [RBX][127:120] is 0 then 0 -> XMM1[127:120]
//
//  RBX [R]  ST1  PSIGNB,     // if [RBX][7] is 1 then
//                            //  0 - ST1[7:0] -> ST1[7:0]
//                            // if [RBX][7:0] is 0 then 0 -> ST1[7:0]
//                            // if [RBX][15] is 1 then
//                            //  0 - ST1[15:8] -> ST1[15:8]
//                            // if [RBX][15:8] is 0 then 0 -> ST1[15:8]
//                            // ...
//                            // if [RBX][63] is 1 then
//                            //  0 - ST1[63:56] -> ST1[63:56]
//                            // if [RBX][63:56] is 0 then 0 -> ST1[63:56]
//
//  XMM2  XMM1  PSIGNB,       // if XMM2[7] is 1 then
//                            //  0 - XMM1[7:0] -> XMM1[7:0]
//                            // if XMM2[7:0] is 0 then 0 -> XMM1[7:0]
//                            // if XMM2[15] is 1 then
//                            //  0 - XMM1[15:8] -> XMM1[15:8]
//                            // if XMM2[15:8] is 0 then 0 -> XMM1[15:8]
//                            // ...
//                            // if XMM2[127] is 1 then
//                            //  0 - XMM1[127:120] -> XMM1[127:120]
//                            // if XMM2[127:120] is 0 then 0 -> XMM1[127:120]
//
//  ST2  ST1  PSIGNB,         // if ST2[7] is 1 then
//                            //  0 - ST1[7:0] -> ST1[7:0]
//                            // if ST2[7:0] is 0 then 0 -> ST1[7:0]
//                            // if ST2[15] is 1 then
//                            //  0 - ST1[15:8] -> ST1[15:8]
//                            // if ST2[15:8] is 0 then 0 -> ST1[15:8]
//                            // ...
//                            // if ST2[63] is 1 then
//                            //  0 - ST1[63:56] -> ST1[63:56]
//                            // if ST2[63:56] is 0 then 0 -> ST1[63:56]
//
//  XMM1 <-  XMM2  PSIGNB, // if XMM2[7] is 1 then
//                            //  0 - XMM1[7:0] -> XMM1[7:0]
//                            // if XMM2[7:0] is 0 then 0 -> XMM1[7:0]
//                            // if XMM2[15] is 1 then
//                            //  0 - XMM1[15:8] -> XMM1[15:8]
//                            // if XMM2[15:8] is 0 then 0 -> XMM1[15:8]
//                            // ...
//                            // if XMM2[127] is 1 then
//                            //  0 - XMM1[127:120] -> XMM1[127:120]
//                            // if XMM2[127:120] is 0 then 0 -> XMM1[127:120]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpsignbcomma ( VPSIGNB, )
//
// C prototype:
//  void dg_forthvpsignbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSIGNB instruction. The sequence negates, clears, or leaves alone
//   the signed byte values in target y based on the signed byte values
//   in the source and puts the results into the destination.
//   If the high bit in source's value is 1, then the target y
//   value is negated. If the source value is zero, the target y value is
//   cleared. Otherwise the target y value is moved to the destination
//   unchanged. 
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPSIGNB,    
//                            // if [RBX][7:0] is 0 then 0 -> XMM1[7:0]
//                            // else if [RBX][7] is 1 then
//                            //  0 - XMM0[7:0] -> XMM1[7:0]
//                            // else XMM0[7:0] -> XMM1[7:0]
//                            // if [RBX][15:8] is 0 then 0 -> XMM1[15:8]
//                            // else if [RBX][15] is 1 then
//                            //  0 - XMM0[15:8] -> XMM1[15:8]
//                            // else XMM0[15:8] -> XMM1[15:8]
//                            // ...
//                            // if [RBX][127:120] is 0 then 0 -> XMM1[127:120]
//                            // else if [RBX][127] is 1 then
//                            //  0 - XMM0[127:120] -> XMM1[127:120]
//                            // else XMM0[127:120] -> XMM1[127:120]
//                            
//
//  YMM2  YMM0  YMM1  VPSIGNB,     
//                            // if YMM2[7:0] is 0 then 0 -> YMM1[7:0]  
//                            // else if YMM2[7] is 1 then
//                            //  0 - YMM0[7:0] -> YMM1[7:0]
//                            // else YMM0[7:0] -> YMM1[7:0]
//                            // if YMM2[15:8] is 0 then 0 -> YMM1[15:8]
//                            // else if YMM2[15] is 1 then
//                            //  0 - YMM0[15:8] -> YMM1[15:8]
//                            // else YMM0[15:8] -> YMM1[15:8]
//                            // ...
//                            // if YMM2[255:248] is 0 then 0 -> YMM1[255:248]
//                            // else if YMM2[255] is 1 then
//                            //  0 - YMM0[255:248] -> YMM1[255:248]
//                            // else YMM0[255:248] -> YMM1[255:248]
//
//  XMM1 <-  XMM0  XMM2  VPSIGNB, 
//                            // if XMM2[7:0] is 0 then 0 -> XMM1[7:0]
//                            // else if XMM2[7] is 1 then
//                            //  0 - XMM0[7:0] -> XMM1[7:0]
//                            // else XMM0[7:0] -> XMM1[7:0]
//                            // if XMM2[15:8] is 0 then 0 -> XMM1[15:8]
//                            // else if XMM2[15] is 1 then
//                            //  0 - XMM0[15:8] -> XMM1[15:8]
//                            // else XMM0[15:8] -> XMM1[15:8]
//                            // ...
//                            // if XMM2[127:120] is 0 then 0 -> XMM1[127:120]
//                            // else if XMM2[127] is 1 then
//                            //  0 - XMM0[127:120] -> XMM1[127:120]
//                            // else XMM0[127:120] -> XMM1[127:120]
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpsigndcomma ( PSIGND, )
//
// C prototype:
//  void dg_forthpsigndcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PSIGND instruction. The sequence negates, clears, or leaves alone
//   the signed 32 bit values in the destination based on the signed 32 bit values
//   in the source. If the high bit in the source's value is 1, then the destination
//   value is negated. If the source value is zero, the destination value is
//   cleared. Otherwise the destination value is unchanged.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PSIGND,    // if [RBX][31] is 1 then
//                            //  0 - XMM1[31:0] -> XMM1[31:0]
//                            // if [RBX][31:0] is 0 then 0 -> XMM1[31:0]
//                            // if [RBX][63] is 1 then
//                            //  0 - XMM1[63:32] -> XMM1[63:32]
//                            // if [RBX][63:32] is 0 then 0 -> XMM1[63:32]
//                            // ...
//                            // if [RBX][127] is 1 then
//                            //  0 - XMM1[127:96] -> XMM1[127:96]
//                            // if [RBX][127:96] is 0 then 0 -> XMM1[127:96]
//
//  RBX [R]  ST1  PSIGND,     // if [RBX][31] is 1 then
//                            //  0 - ST1[31:0] -> ST1[31:0]
//                            // if [RBX][31:0] is 0 then 0 -> ST1[31:0]
//                            // if [RBX][63] is 1 then
//                            //  0 - ST1[63:32] -> ST1[63:32]
//                            // if [RBX][63:32] is 0 then 0 -> ST1[63:32]
//
//  XMM2  XMM1  PSIGND,       // if XMM2[31] is 1 then
//                            //  0 - XMM1[31:0] -> XMM1[31:0]
//                            // if XMM2[31:0] is 0 then 0 -> XMM1[31:0]
//                            // if XMM2[63] is 1 then
//                            //  0 - XMM1[63:32] -> XMM1[63:32]
//                            // if XMM2[63:32] is 0 then 0 -> XMM1[63:32]
//                            // ...
//                            // if XMM2[127] is 1 then
//                            //  0 - XMM1[127:96] -> XMM1[127:96]
//                            // if XMM2[127:96] is 0 then 0 -> XMM1[127:96]
//
//  ST2  ST1  PSIGND,         // if ST2[31] is 1 then
//                            //  0 - ST1[31:0] -> ST1[31:0]
//                            // if ST2[31:0] is 0 then 0 -> ST1[31:0]
//                            // if ST2[63] is 1 then
//                            //  0 - ST1[63:32] -> ST1[63:32]
//                            // if ST2[63:32] is 0 then 0 -> ST1[63:32]
//
//  XMM1 <-  XMM2  PSIGND, // if XMM2[31] is 1 then
//                            //  0 - XMM1[31:0] -> XMM1[31:0]
//                            // if XMM2[31:0] is 0 then 0 -> XMM1[31:0]
//                            // if XMM2[63] is 1 then
//                            //  0 - XMM1[63:32] -> XMM1[63:32]
//                            // if XMM2[63:32] is 0 then 0 -> XMM1[63:32]
//                            // ...
//                            // if XMM2[127] is 1 then
//                            //  0 - XMM1[127:96] -> XMM1[127:96]
//                            // if XMM2[127:96] is 0 then 0 -> XMM1[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpsigndcomma ( VPSIGND, )
//
// C prototype:
//  void dg_forthvpsigndcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSIGND instruction. The sequence negates, clears, or leaves alone
//   the signed 32 bit values in target y based on the signed 32 bit values
//   in the source and puts the results into the destination. If the high bit 
//   in the source's value is 1, then the target y value is negated. If the 
//   source value is zero, the target y value is cleared. Otherwise the 
//   target y value is moved to the destination unchanged.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPSIGND, 
//                            // if [RBX][31:0] is 0 then 0 -> XMM1[31:0]  
//                            // else if [RBX][31] is 1 then
//                            //  0 - XMM0[31:0] -> XMM1[31:0]
//                            // else XMM0[31:0] -> XMM1[31:0]
//                            // if [RBX][63:32] is 0 then 0 -> XMM1[63:32]
//                            // else if [RBX][63] is 1 then
//                            //  0 - XMM0[63:32] -> XMM1[63:32]
//                            // else XMM0[63:32] -> XMM1[63:32]
//                            // ...
//                            // if [RBX][127:96] is 0 then 0 -> XMM1[127:96]
//                            // else if [RBX][127] is 1 then
//                            //  0 - XMM0[127:96] -> XMM1[127:96]
//                            // else XMM0[127:96] -> XMM1[127:96]
//
//  YMM2  YMM0  YMM1  VPSIGND,  
//                            // if YMM2[31:0] is 0 then 0 -> YMM1[31:0]    
//                            // else if YMM2[31] is 1 then
//                            //  0 - YMM0[31:0] -> YMM1[31:0]
//                            // else YMM0[31:0] -> YMM1[31:0]
//                            // if YMM2[63:32] is 0 then 0 -> YMM1[63:32]
//                            // else if YMM2[63] is 1 then
//                            //  0 - YMM0[63:32] -> YMM1[63:32]
//                            // else YMM0[63:32] -> YMM1[63:32]
//                            // ...
//                            // if YMM2[255:224] is 0 then 0 -> YMM1[255:224]
//                            // else if YMM2[255] is 1 then
//                            //  0 - YMM0[255:224] -> YMM1[255:224]
//                            // else YMM0[255:224] -> YMM1[255:224]
//
//  XMM1 <-  XMM0  XMM2  VPSIGND, 
//                            // if XMM2[31:0] is 0 then 0 -> XMM1[31:0]
//                            // else if XMM2[31] is 1 then
//                            //  0 - XMM0[31:0] -> XMM1[31:0]
//                            // else XMM0[31:0] -> XMM1[31:0]
//                            // if XMM2[63:32] is 0 then 0 -> XMM1[63:32]
//                            // else if XMM2[63] is 1 then
//                            //  0 - XMM0[63:32] -> XMM1[63:32]
//                            // else XMM0[63:32] -> XMM1[63:32]
//                            // ...
//                            // if XMM2[127:96] is 0 then 0 -> XMM1[127:96]
//                            // else if XMM2[127] is 1 then
//                            //  0 - XMM0[127:96] -> XMM1[127:96]
//                            // else XMM0[127:96] -> XMM1[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpsignwcomma ( PSIGNW, )
//
// C prototype:
//  void dg_forthpsignwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PSIGNW instruction. The sequence negates, clears, or leaves alone
//   the signed 16 bit values in the destination based on the signed 16 bit values
//   in the source. If the high bit in the source's value is 1, then the destination
//   value is negated. If the source value is zero, the destination value is
//   cleared. Otherwise the destination value is unchanged.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PSIGNW,    // if [RBX][15] is 1 then
//                            //  0 - XMM1[15:0] -> XMM1[15:0]
//                            // if [RBX][15:0] is 0 then 0 -> XMM1[15:0]
//                            // if [RBX][31] is 1 then
//                            //  0 - XMM1[31:16] -> XMM1[31:16]
//                            // if [RBX][31:16] is 0 then 0 -> XMM1[31:16]
//                            // ...
//                            // if [RBX][127] is 1 then
//                            //  0 - XMM1[127:112] -> XMM1[127:112]
//                            // if [RBX][127:112] is 0 then 0 -> XMM1[127:112]
//
//  RBX [R]  ST1  PSIGNW,     // if [RBX][15] is 1 then
//                            //  0 - ST1[15:0] -> ST1[15:0]
//                            // if [RBX][15:0] is 0 then 0 -> ST1[15:0]
//                            // if [RBX][31] is 1 then
//                            //  0 - ST1[31:16] -> ST1[31:16]
//                            // if [RBX][31:16] is 0 then 0 -> ST1[31:16]
//                            // ...
//                            // if [RBX][63] is 1 then
//                            //  0 - ST1[63:48] -> ST1[63:48]
//                            // if [RBX][63:48] is 0 then 0 -> ST1[63:48]
//
//  XMM2  XMM1  PSIGNW,       // if XMM2[15] is 1 then
//                            //  0 - XMM1[15:0] -> XMM1[15:0]
//                            // if XMM2[15:0] is 0 then 0 -> XMM1[15:0]
//                            // if XMM2[31] is 1 then
//                            //  0 - XMM1[31:16] -> XMM1[31:16]
//                            // if XMM2[31:16] is 0 then 0 -> XMM1[31:16]
//                            // ...
//                            // if XMM2[127] is 1 then
//                            //  0 - XMM1[127:112] -> XMM1[127:112]
//                            // if XMM2[127:112] is 0 then 0 -> XMM1[127:112]
//
//  ST2  ST1  PSIGNW,         // if ST2[15] is 1 then
//                            //  0 - ST1[15:0] -> ST1[15:0]
//                            // if ST2[15:0] is 0 then 0 -> ST1[15:0]
//                            // if ST2[31] is 1 then
//                            //  0 - ST1[31:16] -> ST1[31:16]
//                            // if ST2[31:16] is 0 then 0 -> ST1[31:16]
//                            // ...
//                            // if ST2[63] is 1 then
//                            //  0 - ST1[63:48] -> ST1[63:48]
//                            // if ST2[63:48] is 0 then 0 -> ST1[63:48]
//
//  XMM1 <-  XMM2  PSIGNW, // if XMM2[15] is 1 then
//                            //  0 - XMM1[15:0] -> XMM1[15:0]
//                            // if XMM2[15:0] is 0 then 0 -> XMM1[15:0]
//                            // if XMM2[31] is 1 then
//                            //  0 - XMM1[31:16] -> XMM1[31:16]
//                            // if XMM2[31:16] is 0 then 0 -> XMM1[31:16]
//                            // ...
//                            // if XMM2[127] is 1 then
//                            //  0 - XMM1[127:112] -> XMM1[127:112]
//                            // if XMM2[127:112] is 0 then 0 -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpsignwcomma ( VPSIGNW, )
//
// C prototype:
//  void dg_forthvpsignwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after a target's addressing
//                                 mode parameters and can not come in the
//                                 middle of them.
//   <-                           sets the direction to reverse.
//                                 This is pushed after a target's addressing
//                                 mode parameters and can not come in the
//                                 middle of them.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSIGNW instruction. The sequence negates, clears, or leaves alone
//   the signed 16 bit values in target y based on the signed 16 bit values
//   in the source and puts the results into the destination. If the high bit
//   in the source's value is 1, then the target y value is negated. 
//   If the source value is zero, the target y value is cleared. Otherwise the 
//   target y value is unchanged.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPSIGNW, 
//                            // if [RBX][15:0] is 0 then 0 -> XMM1[15:0]  
//                            // else if [RBX][15] is 1 then
//                            //  0 - XMM0[15:0] -> XMM1[15:0]
//                            // else XMM0[15:0] -> XMM1[15:0]
//                            // if [RBX][31:16] is 0 then 0 -> XMM1[31:16]
//                            // else if [RBX][31] is 1 then
//                            //  0 - XMM0[31:16] -> XMM1[31:16]
//                            // else XMM0[31:16] -> XMM1[31:16]
//                            // ...
//                            // if [RBX][127:112] is 0 then 0 -> XMM1[127:112]
//                            // else if [RBX][127] is 1 then
//                            //  0 - XMM0[127:112] -> XMM1[127:112]
//                            // else XMM0[127:112] -> XMM1[127:112]
//
//  YMM2  YMM0  YMM1  VPSIGNW,  
//                            // if YMM2[15:0] is 0 then 0 -> YMM1[15:0]     
//                            // else if YMM2[15] is 1 then
//                            //  0 - YMM0[15:0] -> YMM1[15:0]
//                            // else YMM0[15:0] -> YMM1[15:0]
//                            // if YMM2[31:16] is 0 then 0 -> YMM1[31:16]
//                            // else if YMM2[31] is 1 then
//                            //  0 - YMM0[31:16] -> YMM1[31:16]
//                            // else YMM0[31:16] -> YMM1[31:16]
//                            // ...
//                            // if YMM2[255:240] is 0 then 0 -> YMM1[255:240]
//                            // else if YMM2[255] is 1 then
//                            //  0 - YMM0[255:240] -> YMM1[255:240]
//                            // else YMM0[255:240] -> YMM1[255:240]
//
//  XMM1 <-  XMM0  XMM2  VPSIGNW, 
//                            // if XMM2[15:0] is 0 then 0 -> XMM1[15:0]
//                            // else if XMM2[15] is 1 then
//                            //  0 - XMM0[15:0] -> XMM1[15:0]
//                            // else XMM0[15:0] -> XMM1[15:0]
//                            // if XMM2[31:16] is 0 then 0 -> XMM1[31:16]
//                            // else if XMM2[31] is 1 then
//                            //  0 - XMM0[31:16] -> XMM1[31:16]
//                            // else XMM0[31:16] -> XMM1[31:16]
//                            // ...
//                            // if XMM2[127:112] is 0 then 0 -> XMM1[127:112]
//                            // else if XMM2[127] is 1 then
//                            //  0 - XMM0[127:112] -> XMM1[127:112]
//                            // else XMM0[127:112] -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y
//   must each be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpsubbcomma ( PSUBB, )
//
// C prototype:
//  void dg_forthpsubbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after a target's addressing
//                                 mode parameters and can not come in the
//                                 middle of them.
//   <-                           sets the direction to reverse.
//                                 This is pushed after a target's addressing
//                                 mode parameters and can not come in the
//                                 middle of them.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PSUBB instruction. This sequence subtracts each byte integer in
//   the source from the corresponding byte integer in the destination and puts
//   the result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PSUBB,    // XMM1[7:0]     - [RBX][7:0]     -> XMM1[7:0]
//                           // XMM1[15:8]    - [RBX][15:8]    -> XMM1[15:8]
//                           // XMM1[23:16]   - [RBX][23:16]   -> XMM1[23:16]
//                           // ...
//                           // XMM1[127:120] - [RBX][127:120] -> XMM1[127:120]
//
//  RBX [R]  ST1  PSUBB,     // ST1[7:0]     - [RBX][7:0]     -> ST1[7:0]
//                           // ST1[15:8]    - [RBX][15:8]    -> ST1[15:8]
//                           // ST1[23:16]   - [RBX][23:16]   -> ST1[23:16]
//                           // ...
//                           // ST1[63:56]   - [RBX][63:56]   -> ST1[63:56]
//
//  XMM2  XMM1  PSUBB,       // XMM1[7:0]     - XMM2[7:0]     -> XMM1[7:0]
//                           // XMM1[15:8]    - XMM2[15:8]    -> XMM1[15:8]
//                           // XMM1[23:16]   - XMM2[23:16]   -> XMM1[23:16]
//                           // ...
//                           // XMM1[127:120] - XMM2[127:120] -> XMM1[127:120]
//
//  ST2  ST1  PSUBB,         // ST1[7:0]     - ST2[7:0]     -> ST1[7:0]
//                           // ST1[15:8]    - ST2[15:8]    -> ST1[15:8]
//                           // ST1[23:16]   - ST2[23:16]   -> ST1[23:16]
//                           // ...
//                           // ST1[63:56]   - ST2[63:56]   -> ST1[63:56]
//
//  XMM1 <-  XMM2  PSUBB, // XMM1[7:0]     - XMM2[7:0]     -> XMM1[7:0]
//                           // XMM1[15:8]    - XMM2[15:8]    -> XMM1[15:8]
//                           // XMM1[23:16]   - XMM2[23:16]   -> XMM1[23:16]
//                           // ...
//                           // XMM1[127:120] - XMM2[127:120] -> XMM1[127:120]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpsubdcomma ( PSUBD, )
//
// C prototype:
//  void dg_forthpsubdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after a target's addressing
//                                 mode parameters and can not come in the
//                                 middle of them.
//   <-                           sets the direction to reverse.
//                                 This is pushed after a target's addressing
//                                 mode parameters and can not come in the
//                                 middle of them.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PSUBD instruction. This sequence subtracts each 32 bit integer in
//   the source from the corresponding 32 bit integer in the destination and puts
//   the result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PSUBD,    // XMM1[31:0]     - [RBX][31:0]     -> XMM1[31:0]
//                           // XMM1[63:32]    - [RBX][63:32]    -> XMM1[63:32]
//                           // XMM1[95:64]    - [RBX][95:64]    -> XMM1[95:64]
//                           // XMM1[127:96]   - [RBX][127:96]   -> XMM1[127:96]
//
//  RBX [R]  ST1  PSUBD,     // ST1[31:0]     - [RBX][31:0]     -> ST1[31:0]
//                           // ST1[63:32]    - [RBX][63:32]    -> ST1[63:32]
//
//  XMM2  XMM1  PSUBD,       // XMM1[31:0]     - XMM2[31:0]     -> XMM1[31:0]
//                           // XMM1[63:32]    - XMM2[63:32]    -> XMM1[63:32]
//                           // XMM1[95:64]    - XMM2[95:64]    -> XMM1[95:64]
//                           // XMM1[127:96]   - XMM2[127:96]   -> XMM1[127:96]
//
//  ST2  ST1  PSUBD,         // ST1[31:0]     - ST2[31:0]     -> ST1[31:0]
//                           // ST1[63:32]    - ST2[63:32]    -> ST1[63:32]
//
//  XMM1 <-  XMM2  PSUBD, // XMM1[31:0]     - XMM2[31:0]     -> XMM1[31:0]
//                           // XMM1[63:32]    - XMM2[63:32]    -> XMM1[63:32]
//                           // XMM1[95:64]    - XMM2[95:64]    -> XMM1[95:64]
//                           // XMM1[127:96]   - XMM2[127:96]   -> XMM1[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpsubqcomma ( PSUBQ, )
//
// C prototype:
//  void dg_forthpsubqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after a target's addressing
//                                 mode parameters and can not come in the
//                                 middle of them.
//   <-                           sets the direction to reverse.
//                                 This is pushed after a target's addressing
//                                 mode parameters and can not come in the
//                                 middle of them.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PSUBQ instruction. For floating point destinations, this sequence
//   subtracts the 64 bit integer in the source from the 64 bit integer in the
//   destination and puts the result into the destination.
//   For xmm destinations, this sequence subtracts each 64 bit integer in
//   the source from the corresponding 64 bit integer in the destination and puts
//   the result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PSUBQ,    // XMM1[63:0]     - [RBX][63:0]     -> XMM1[63:0]
//                           // XMM1[127:64]   - [RBX][127:64]   -> XMM1[127:64]
//
//  RBX [R]  ST1  PSUBQ,     // ST1[63:0]     - [RBX][63:0]     -> ST1[63:0]
//
//  XMM2  XMM1  PSUBQ,       // XMM1[63:0]     - XMM2[63:0]     -> XMM1[63:0]
//                           // XMM1[127:64]   - XMM2[127:64]   -> XMM1[127:64]
//
//  ST2  ST1  PSUBQ,         // ST1[63:0]     - ST2[63:0]     -> ST1[63:0]
//
//  XMM1 <-  XMM2  PSUBQ, // XMM1[63:0]     - XMM2[63:0]     -> XMM1[63:0]
//                           // XMM1[127:64]   - XMM2[127:64]   -> XMM1[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpsubwcomma ( PSUBW, )
//
// C prototype:
//  void dg_forthpsubwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either target's
//                                 addressing mode parameters and cannot come
//                                 in the middle of a target's addressing mode
//                                 parameters.
//   <-                           sets the direction to reverse.
//                                 This is pushed after either target's
//                                 addressing mode parameters and cannot come
//                                 in the middle of a target's addressing mode
//                                 parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PSUBW instruction. This sequence subtracts each 16 bit integer in
//   the source from the corresponding 16 bit integer in the destination and puts
//   the result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PSUBW,    // XMM1[15:0]     - [RBX][15:0]     -> XMM1[15:0]
//                           // XMM1[31:16]    - [RBX][31:16]    -> XMM1[31:16]
//                           // XMM1[47:32]    - [RBX][47:32]    -> XMM1[47:32]
//                           // ...
//                           // XMM1[127:112] - [RBX][127:112]  -> XMM1[127:112]
//
//  RBX [R]  ST1  PSUBW,     // ST1[15:0]     - [RBX][15:0]     -> ST1[15:0]
//                           // ST1[31:16]    - [RBX][31:16]    -> ST1[31:16]
//                           // ST1[47:32]    - [RBX][47:32]    -> ST1[47:32]
//                           // ST1[63:48]    - [RBX][63:48]    -> ST1[63:48]
//
//  XMM2  XMM1  PSUBW,       // XMM1[15:0]     - XMM2[15:0]     -> XMM1[15:0]
//                           // XMM1[31:16]    - XMM2[31:16]    -> XMM1[31:16]
//                           // XMM1[47:32]    - XMM2[47:32]    -> XMM1[47:32]
//                           // ...
//                           // XMM1[127:112] - XMM2[127:112]  -> XMM1[127:112]
//
//  ST2  ST1  PSUBW,         // ST1[15:0]     - ST2[15:0]     -> ST1[15:0]
//                           // ST1[31:16]    - ST2[31:16]    -> ST1[31:16]
//                           // ST1[47:32]    - ST2[47:32]    -> ST1[47:32]
//                           // ST1[63:48]    - ST2[63:48]    -> ST1[63:48]
//
//  XMM1 <-  XMM2  PSUBW,    // XMM1[15:0]     - XMM2[15:0]     -> XMM1[15:0]
//                           // XMM1[31:16]    - XMM2[31:16]    -> XMM1[31:16]
//                           // XMM1[47:32]    - XMM2[47:32]    -> XMM1[47:32]
//                           // ...
//                           // XMM1[127:112] - XMM2[127:112]  -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpsubsbcomma ( PSUBSB, )
//
// C prototype:
//  void dg_forthpsubsbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either target's
//                                 addressing mode parameters and cannot come
//                                 in the middle of a target's addressing mode
//                                 parameters.
//   <-                           sets the direction to reverse.
//                                 This is pushed after either target's
//                                 addressing mode parameters and cannot come
//                                 in the middle of a target's addressing mode
//                                 parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PSUBSB instruction. This sequence subtracts each signed byte integer
//   in the source from the corresponding signed byte integer in the destination.
//   Then if the results exceed the range of a signed byte value, the results
//   are limited to the greatest or least possible signed byte values.
//   Then the results are put into the destination.
//   Greatest possible signed byte value is 127.
//   Least possible signed byte value is -128.
//   So if you do 127 - -1, you get 127, not -128.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PSUBSB,   // XMM1[7:0]     - [RBX][7:0]     -> XMM1[7:0]
//                           // XMM1[15:8]    - [RBX][15:8]    -> XMM1[15:8]
//                           // XMM1[23:16]   - [RBX][23:16]   -> XMM1[23:16]
//                           // ...
//                           // XMM1[127:120] - [RBX][127:120] -> XMM1[127:120]
//
//  RBX [R]  ST1  PSUBSB,    // ST1[7:0]     - [RBX][7:0]     -> ST1[7:0]
//                           // ST1[15:8]    - [RBX][15:8]    -> ST1[15:8]
//                           // ST1[23:16]   - [RBX][23:16]   -> ST1[23:16]
//                           // ...
//                           // ST1[63:56]   - [RBX][63:56]   -> ST1[63:56]
//
//  XMM2  XMM1  PSUBSB,      // XMM1[7:0]     - XMM2[7:0]     -> XMM1[7:0]
//                           // XMM1[15:8]    - XMM2[15:8]    -> XMM1[15:8]
//                           // XMM1[23:16]   - XMM2[23:16]   -> XMM1[23:16]
//                           // ...
//                           // XMM1[127:120] - XMM2[127:120] -> XMM1[127:120]
//
//  ST2  ST1  PSUBSB,        // ST1[7:0]     - ST2[7:0]     -> ST1[7:0]
//                           // ST1[15:8]    - ST2[15:8]    -> ST1[15:8]
//                           // ST1[23:16]   - ST2[23:16]   -> ST1[23:16]
//                           // ...
//                           // ST1[63:56]   - ST2[63:56]   -> ST1[63:56]
//
//  XMM1 <-  XMM2  PSUBSB,    // XMM1[7:0]     - XMM2[7:0]     -> XMM1[7:0]
//                            // XMM1[15:8]    - XMM2[15:8]    -> XMM1[15:8]
//                            // XMM1[23:16]   - XMM2[23:16]   -> XMM1[23:16]
//                            // ...
//                            // XMM1[127:120] - XMM2[127:120] -> XMM1[127:120]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpsubswcomma ( PSUBSW, )
//
// C prototype:
//  void dg_forthpsubswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either target's
//                                 addressing mode parameters and cannot come
//                                 in the middle of a target's addressing mode
//                                 parameters.
//   <-                           sets the direction to reverse.
//                                 This is pushed after either target's
//                                 addressing mode parameters and cannot come
//                                 in the middle of a target's addressing mode
//                                 parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PSUBSW instruction. This sequence subtracts each signed 16 bit
//   integer in the source from the corresponding signed 16 bit integer in the
//   destination. Then, if the results exceed the range of a signed 16 bit
//   value, the results are limited to the greatest or least possible signed
//   16 bit values.
//   Then the results are put into the destination.
//   Greatest possible signed 16 bit integer is HEX 7FFF.
//   Least possible signed 16 bit integer is HEX -8000.
//   So if you do 7FFF - -1, you get 7FFF, not -8000.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PSUBSW,    // XMM1[15:0]     - [RBX][15:0]     -> XMM1[15:0]
//                            // XMM1[31:16]    - [RBX][31:16]    -> XMM1[31:16]
//                            // XMM1[47:32]    - [RBX][47:32]    -> XMM1[47:32]
//                            // ...
//                            // XMM1[127:112] - [RBX][127:112]  -> XMM1[127:112]
//
//  RBX [R]  ST1  PSUBSW,     // ST1[15:0]     - [RBX][15:0]     -> ST1[15:0]
//                            // ST1[31:16]    - [RBX][31:16]    -> ST1[31:16]
//                            // ST1[47:32]    - [RBX][47:32]    -> ST1[47:32]
//                            // ST1[63:48]    - [RBX][63:48]    -> ST1[63:48]
//
//  XMM2  XMM1  PSUBSW,       // XMM1[15:0]     - XMM2[15:0]     -> XMM1[15:0]
//                            // XMM1[31:16]    - XMM2[31:16]    -> XMM1[31:16]
//                            // XMM1[47:32]    - XMM2[47:32]    -> XMM1[47:32]
//                            // ...
//                            // XMM1[127:112] - XMM2[127:112]  -> XMM1[127:112]
//
//  ST2  ST1  PSUBSW,         // ST1[15:0]     - ST2[15:0]     -> ST1[15:0]
//                            // ST1[31:16]    - ST2[31:16]    -> ST1[31:16]
//                            // ST1[47:32]    - ST2[47:32]    -> ST1[47:32]
//                            // ST1[63:48]    - ST2[63:48]    -> ST1[63:48]
//
//  XMM1 <-  XMM2  PSUBSW,    // XMM1[15:0]     - XMM2[15:0]     -> XMM1[15:0]
//                            // XMM1[31:16]    - XMM2[31:16]    -> XMM1[31:16]
//                            // XMM1[47:32]    - XMM2[47:32]    -> XMM1[47:32]
//                            // ...
//                            // XMM1[127:112] - XMM2[127:112]  -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpsubusbcomma ( PSUBUSB, )
//
// C prototype:
//  void dg_forthpsubusbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either target's
//                                 addressing mode parameters and cannot come
//                                 in the middle of a target's addressing mode
//                                 parameters.
//   <-                           sets the direction to reverse.
//                                 This is pushed after either target's
//                                 addressing mode parameters and cannot come
//                                 in the middle of a target's addressing mode
//                                 parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PSUBUSB instruction. This sequence subtracts each unsigned byte integer
//   in the source from the corresponding unsigned byte integer in the destination.
//   Then if the results fall below the range of an unsigned byte value,
//   the results are limited to the least possible unsigned byte
//   value, which is 0. Then the results are put into the destination.
//   So if you do 1 - 2, you get 0, not 255.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PSUBUSB,  // XMM1[7:0]     - [RBX][7:0]     -> XMM1[7:0]
//                           // XMM1[15:8]    - [RBX][15:8]    -> XMM1[15:8]
//                           // XMM1[23:16]   - [RBX][23:16]   -> XMM1[23:16]
//                           // ...
//                           // XMM1[127:120] - [RBX][127:120] -> XMM1[127:120]
//
//  RBX [R]  ST1  PSUBUSB,   // ST1[7:0]     - [RBX][7:0]     -> ST1[7:0]
//                           // ST1[15:8]    - [RBX][15:8]    -> ST1[15:8]
//                           // ST1[23:16]   - [RBX][23:16]   -> ST1[23:16]
//                           // ...
//                           // ST1[63:56]   - [RBX][63:56]   -> ST1[63:56]
//
//  XMM2  XMM1  PSUBUSB,     // XMM1[7:0]     - XMM2[7:0]     -> XMM1[7:0]
//                           // XMM1[15:8]    - XMM2[15:8]    -> XMM1[15:8]
//                           // XMM1[23:16]   - XMM2[23:16]   -> XMM1[23:16]
//                           // ...
//                           // XMM1[127:120] - XMM2[127:120] -> XMM1[127:120]
//
//  ST2  ST1  PSUBUSB,       // ST1[7:0]     - ST2[7:0]     -> ST1[7:0]
//                           // ST1[15:8]    - ST2[15:8]    -> ST1[15:8]
//                           // ST1[23:16]   - ST2[23:16]   -> ST1[23:16]
//                           // ...
//                           // ST1[63:56]   - ST2[63:56]   -> ST1[63:56]
//
//  XMM1 <-  XMM2  PSUBUSB,    // XMM1[7:0]     - XMM2[7:0]     -> XMM1[7:0]
//                             // XMM1[15:8]    - XMM2[15:8]    -> XMM1[15:8]
//                             // XMM1[23:16]   - XMM2[23:16]   -> XMM1[23:16]
//                             // ...
//                             // XMM1[127:120] - XMM2[127:120] -> XMM1[127:120]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpsubuswcomma ( PSUBUSW, )
//
// C prototype:
//  void dg_forthpsubuswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PSUBUSW instruction. This sequence subtracts each 16 bit
//   integer in the source from the corresponding 16 bit integer in
//   the destination. Then, if the results of the subtractions exceed
//   the range of an unsigned 16 bit value, the results are limited to the
//   least possible unsigned 16 bit value, which is 0. Then the results are
//   put into the destination.
//   So if you do 1 - 2, you get 0, not HEX FFFF.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PSUBUSW,   // XMM1[15:0]     - [RBX][15:0]     -> XMM1[15:0]
//                            // XMM1[31:16]    - [RBX][31:16]    -> XMM1[31:16]
//                            // XMM1[47:32]    - [RBX][47:32]    -> XMM1[47:32]
//                            // ...
//                            // XMM1[127:112] - [RBX][127:112]  -> XMM1[127:112]
//
//  RBX [R]  ST1  PSUBUSW,    // ST1[15:0]     - [RBX][15:0]     -> ST1[15:0]
//                            // ST1[31:16]    - [RBX][31:16]    -> ST1[31:16]
//                            // ST1[47:32]    - [RBX][47:32]    -> ST1[47:32]
//                            // ST1[63:48]    - [RBX][63:48]    -> ST1[63:48]
//
//  XMM2  XMM1  PSUBUSW,      // XMM1[15:0]     - XMM2[15:0]     -> XMM1[15:0]
//                            // XMM1[31:16]    - XMM2[31:16]    -> XMM1[31:16]
//                            // XMM1[47:32]    - XMM2[47:32]    -> XMM1[47:32]
//                            // ...
//                            // XMM1[127:112] - XMM2[127:112]  -> XMM1[127:112]
//
//  ST2  ST1  PSUBUSW,        // ST1[15:0]     - ST2[15:0]     -> ST1[15:0]
//                            // ST1[31:16]    - ST2[31:16]    -> ST1[31:16]
//                            // ST1[47:32]    - ST2[47:32]    -> ST1[47:32]
//                            // ST1[63:48]    - ST2[63:48]    -> ST1[63:48]
//
//  XMM1 <-  XMM2  PSUBUSW, // XMM1[15:0]     - XMM2[15:0]     -> XMM1[15:0]
//                             // XMM1[31:16]    - XMM2[31:16]    -> XMM1[31:16]
//                             // XMM1[47:32]    - XMM2[47:32]    -> XMM1[47:32]
//                             // ...
//                             // XMM1[127:112] - XMM2[127:112]  -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthptestcomma ( PTEST, )
//
// C prototype:
//  void dg_forthptestcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PTEST instruction. This sequence does a binary and of the source
//   and destination and sets the zero flag if the result is 0, otherwise
//   the zero flag is cleared. The result is not stored.
//   This sequence also does a binary and of the source with the binary
//   inverse of the destination and sets the carry flag if the result is 0,
//   otherwise the carry flag is cleared.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PTEST,     // if [RBX][127:0] and XMM1[127:0] is 0 then
//                            //  1 -> ZF otherwise 0 -> ZF
//                            // if [RBX][127:0] and (not XMM1[127:0]) is 0 then
//                            //  1 -> CF otherwise 0 -> CF
//
//  XMM2  XMM1  PTEST,        // if XMM2[127:0] and XMM1[127:0] is 0 then
//                            //  1 -> ZF otherwise 0 -> ZF
//                            // if XMM2[127:0] and (not XMM1[127:0]) is 0 then
//                            //  1 -> CF otherwise 0 -> CF
//
//  XMM1 <-  XMM2  PTEST,     // if XMM2[127:0] and XMM1[127:0] is 0 then
//                            //  1 -> ZF otherwise 0 -> ZF
//                            // if XMM2[127:0] and (not XMM1[127:0]) is 0 then
//                            //  1 -> CF otherwise 0 -> CF
//
//  XMM1  XMM8  PTEST,        // if XMM1[127:0] and XMM8[127:0] is 0 then
//                            //  1 -> ZF otherwise 0 -> ZF
//                            // if XMM1[127:0] and (not XMM8[127:0]) is 0 then
//                            //  1 -> CF otherwise 0 -> CF
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvptestcomma ( VPTEST, )
//
// C prototype:
//  void dg_forthvptestcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VPTEST instruction. This sequence does a binary and of the source
//   and destination and sets the zero flag if the result is 0, otherwise
//   the zero flag is cleared. The result is not stored.
//   This sequence also does a binary and of the source with the binary
//   inverse of the destination and sets the carry flag if the result is 0,
//   otherwise the carry flag is cleared.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  VPTEST,    // if [RBX][127:0] and XMM1[127:0] is 0 then
//                            //  1 -> ZF otherwise 0 -> ZF
//                            // if [RBX][127:0] and (not XMM1[127:0]) is 0 then
//                            //  1 -> CF otherwise 0 -> CF
//
//  XMM2  XMM1  VPTEST,       // if XMM2[127:0] and XMM1[127:0] is 0 then
//                            //  1 -> ZF otherwise 0 -> ZF
//                            // if XMM2[127:0] and (not XMM1[127:0]) is 0 then
//                            //  1 -> CF otherwise 0 -> CF
//
//  XMM1 <-  XMM2  VPTEST,    // if XMM2[127:0] and XMM1[127:0] is 0 then
//                            //  1 -> ZF otherwise 0 -> ZF
//                            // if XMM2[127:0] and (not XMM1[127:0]) is 0 then
//                            //  1 -> CF otherwise 0 -> CF
//
//  YMM1  YMM8  VPTEST,       // if YMM1[255:0] and YMM8[255:0] is 0 then
//                            //  1 -> ZF otherwise 0 -> ZF
//                            // if YMM1[255:0] and (not YMM8[255:0]) is 0 then
//                            //  1 -> CF otherwise 0 -> CF
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpunpckhbwcomma ( PUNPCKHBW, )
//
// C prototype:
//  void dg_forthpunpckhbwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PUNPCKHBW instruction. This sequence moves bytes from the high half
//   of the source, and bytes from the high half of the destination to the
//   destination. The bytes are alternated from the source and destination with
//   the lowest byte from the destination ending up in the lowest position.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PUNPCKHBW,  // XMM1[71:64]    -> XMM1[7:0]
//                             // [RBX][71:64]   -> XMM1[15:8]
//                             // XMM1[79:72]    -> XMM1[23:16]
//                             // [RBX][79:72]   -> XMM1[31:24]
//                             // XMM1[87:80]    -> XMM1[39:32]
//                             // [RBX][87:80]   -> XMM1[47:40]
//                             // XMM1[95:88]    -> XMM1[55:48]
//                             // [RBX][95:88]   -> XMM1[63:56]
//                             // XMM1[103:96]   -> XMM1[71:64]
//                             // [RBX][103:96]  -> XMM1[79:72]
//                             // XMM1[111:104]  -> XMM1[87:80]
//                             // [RBX][111:104] -> XMM1[95:88]
//                             // XMM1[119:112]  -> XMM1[103:96]
//                             // [RBX][119:112] -> XMM1[111:104]
//                             // XMM1[127:120]  -> XMM1[119:112]
//                             // [RBX][127:120] -> XMM1[127:120]
//
//
//  RBX [R]  ST1  PUNPCKHBW,   // ST1[39:32]    -> ST1[7:0]
//                             // [RBX][39:32]  -> ST1[15:8]
//                             // ST1[47:40]    -> ST1[23:16]
//                             // [RBX][47:40]  -> ST1[31:24]
//                             // ST1[55:48]    -> ST1[39:32]
//                             // [RBX][55:48]  -> ST1[47:40]
//                             // ST1[63:56]    -> ST1[55:48]
//                             // [RBX][63:56]  -> ST1[63:56]
//
//  XMM2  XMM1  PUNPCKHBW,     // XMM1[71:64]   -> XMM1[7:0]
//                             // XMM2[71:64]   -> XMM1[15:8]
//                             // XMM1[79:72]   -> XMM1[23:16]
//                             // XMM2[79:72]   -> XMM1[31:24]
//                             // XMM1[87:80]   -> XMM1[39:32]
//                             // XMM2[87:80]   -> XMM1[47:40]
//                             // XMM1[95:88]   -> XMM1[55:48]
//                             // XMM2[95:88]   -> XMM1[63:56]
//                             // XMM1[103:96]  -> XMM1[71:64]
//                             // XMM2[103:96]  -> XMM1[79:72]
//                             // XMM1[111:104] -> XMM1[87:80]
//                             // XMM2[111:104] -> XMM1[95:88]
//                             // XMM1[119:112] -> XMM1[103:96]
//                             // XMM2[119:112] -> XMM1[111:104]
//                             // XMM1[127:120] -> XMM1[119:112]
//                             // XMM2[127:120] -> XMM1[127:120]
//
//  ST2  ST1  PUNPCKHBW,       // ST1[39:32]  -> ST1[7:0]
//                             // ST2[39:32]  -> ST1[15:8]
//                             // ST1[47:40]  -> ST1[23:16]
//                             // ST2[47:40]  -> ST1[31:24]
//                             // ST1[55:48]  -> ST1[39:32]
//                             // ST2[55:48]  -> ST1[47:40]
//                             // ST1[63:56]  -> ST1[55:48]
//                             // ST2[63:56]  -> ST1[63:56]
//
//  XMM1 <-  XMM2  PUNPCKHBW, // XMM1[71:64]   -> XMM1[7:0]
//                               // XMM2[71:64]   -> XMM1[15:8]
//                               // XMM1[79:72]   -> XMM1[23:16]
//                               // XMM2[79:72]   -> XMM1[31:24]
//                               // XMM1[87:80]   -> XMM1[39:32]
//                               // XMM2[87:80]   -> XMM1[47:40]
//                               // XMM1[95:88]   -> XMM1[55:48]
//                               // XMM2[95:88]   -> XMM1[63:56]
//                               // XMM1[103:96]  -> XMM1[71:64]
//                               // XMM2[103:96]  -> XMM1[79:72]
//                               // XMM1[111:104] -> XMM1[87:80]
//                               // XMM2[111:104] -> XMM1[95:88]
//                               // XMM1[119:112] -> XMM1[103:96]
//                               // XMM2[119:112] -> XMM1[111:104]
//                               // XMM1[127:120] -> XMM1[119:112]
//                               // XMM2[127:120] -> XMM1[127:120]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpunpckhbwcomma ( VPUNPCKHBW, )
//
// C prototype:
//  void dg_forthvpunpckhbwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPUNPCKHBW instruction. This sequence moves bytes from the high half
//   of each 128 bit section of the source, and bytes from the high half of each
//   128 bit section of target y to the destination. The bytes are alternated
//   from the source and target y with the lowest byte from target y ending up 
//   in the lowest position.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPUNPCKHBW,  
//                             // XMM0[71:64]    -> XMM1[7:0]
//                             // [RBX][71:64]   -> XMM1[15:8]
//                             // XMM0[79:72]    -> XMM1[23:16]
//                             // [RBX][79:72]   -> XMM1[31:24]
//                             // XMM0[87:80]    -> XMM1[39:32]
//                             // [RBX][87:80]   -> XMM1[47:40]
//                             // XMM0[95:88]    -> XMM1[55:48]
//                             // [RBX][95:88]   -> XMM1[63:56]
//                             // XMM0[103:96]   -> XMM1[71:64]
//                             // [RBX][103:96]  -> XMM1[79:72]
//                             // XMM0[111:104]  -> XMM1[87:80]
//                             // [RBX][111:104] -> XMM1[95:88]
//                             // XMM0[119:112]  -> XMM1[103:96]
//                             // [RBX][119:112] -> XMM1[111:104]
//                             // XMM0[127:120]  -> XMM1[119:112]
//                             // [RBX][127:120] -> XMM1[127:120]
//
//  YMM2  YMM0  YMM1  VPUNPCKHBW,     
//                             // YMM0[71:64]   -> YMM1[7:0]
//                             // YMM2[71:64]   -> YMM1[15:8]
//                             // YMM0[79:72]   -> YMM1[23:16]
//                             // YMM2[79:72]   -> YMM1[31:24]
//                             // YMM0[87:80]   -> YMM1[39:32]
//                             // YMM2[87:80]   -> YMM1[47:40]
//                             // YMM0[95:88]   -> YMM1[55:48]
//                             // YMM2[95:88]   -> YMM1[63:56]
//                             // YMM0[103:96]  -> YMM1[71:64]
//                             // YMM2[103:96]  -> YMM1[79:72]
//                             // YMM0[111:104] -> YMM1[87:80]
//                             // YMM2[111:104] -> YMM1[95:88]
//                             // YMM0[119:112] -> YMM1[103:96]
//                             // YMM2[119:112] -> YMM1[111:104]
//                             // YMM0[127:120] -> YMM1[119:112]
//                             // YMM2[127:120] -> YMM1[127:120]
//                             // ...
//                             // YMM0[255:248] -> YMM1[247:240]
//                             // YMM2[255:248] -> YMM1[255:248]
//
//  XMM1 <-  XMM0  XMM2  VPUNPCKHBW, 
//                               // XMM0[71:64]   -> XMM1[7:0]
//                               // XMM2[71:64]   -> XMM1[15:8]
//                               // XMM0[79:72]   -> XMM1[23:16]
//                               // XMM2[79:72]   -> XMM1[31:24]
//                               // XMM0[87:80]   -> XMM1[39:32]
//                               // XMM2[87:80]   -> XMM1[47:40]
//                               // XMM0[95:88]   -> XMM1[55:48]
//                               // XMM2[95:88]   -> XMM1[63:56]
//                               // XMM0[103:96]  -> XMM1[71:64]
//                               // XMM2[103:96]  -> XMM1[79:72]
//                               // XMM0[111:104] -> XMM1[87:80]
//                               // XMM2[111:104] -> XMM1[95:88]
//                               // XMM0[119:112] -> XMM1[103:96]
//                               // XMM2[119:112] -> XMM1[111:104]
//                               // XMM0[127:120] -> XMM1[119:112]
//                               // XMM2[127:120] -> XMM1[127:120]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpunpckhwdcomma ( PUNPCKHWD, )
//
// C prototype:
//  void dg_forthpunpckhwdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PUNPCKHWD instruction. This sequence moves 16 bit values from the
//   high half of the source, and 16 bit values from the high half of the
//   destination to the destination. The 16 bit values are alternated from the
//   source and destination with the lowest 16 bit value from the destination
//   ending up in the lowest position.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PUNPCKHWD,  // XMM1[79:64]    -> XMM1[15:0]
//                             // [RBX][79:64]   -> XMM1[31:16]
//                             // XMM1[95:80]    -> XMM1[47:32]
//                             // [RBX][95:80]   -> XMM1[63:48]
//                             // XMM1[111:96]   -> XMM1[79:64]
//                             // [RBX][111:96]  -> XMM1[95:80]
//                             // XMM1[127:112]  -> XMM1[111:96]
//                             // [RBX][127:112] -> XMM1[127:112]
//
//
//  RBX [R]  ST1  PUNPCKHWD,   // ST1[47:32]    -> ST1[15:0]
//                             // [RBX][47:32]  -> ST1[31:16]
//                             // ST1[63:48]    -> ST1[47:32]
//                             // [RBX][63:48]  -> ST1[63:48]
//
//  XMM2  XMM1  PUNPCKHWD,     // XMM1[79:64]    -> XMM1[15:0]
//                             // XMM2[79:64]    -> XMM1[31:16]
//                             // XMM1[95:80]    -> XMM1[47:32]
//                             // XMM2[95:80]    -> XMM1[63:48]
//                             // XMM1[111:96]   -> XMM1[79:64]
//                             // XMM2[111:96]   -> XMM1[95:80]
//                             // XMM1[127:112]  -> XMM1[111:96]
//                             // XMM2[127:112]  -> XMM1[127:112]
//
//  ST2  ST1  PUNPCKHWD,       // ST1[47:32]   -> ST1[15:0]
//                             // ST2[47:32]  -> ST1[31:16]
//                             // ST1[63:48]   -> ST1[47:32]
//                             // ST2[63:48]  -> ST1[63:48]
//
//  XMM1 <-  XMM2  PUNPCKHWD, // XMM1[79:64]    -> XMM1[15:0]
//                               // XMM2[79:64]    -> XMM1[31:16]
//                               // XMM1[95:80]    -> XMM1[47:32]
//                               // XMM2[95:80]    -> XMM1[63:48]
//                               // XMM1[111:96]   -> XMM1[79:64]
//                               // XMM2[111:96]   -> XMM1[95:80]
//                               // XMM1[127:112]  -> XMM1[111:96]
//                               // XMM2[127:112]  -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpunpckhwdcomma ( VPUNPCKHWD, )
//
// C prototype:
//  void dg_forthvpunpckhwdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPUNPCKHWD instruction. This sequence moves 16 bit values from the
//   high half of each 128 bit section of the source, and 16 bit values from the
//   high half of each 128 bit section of target y to the destination. The 16 
//   bit values are alternated from the source and target y with the lowest 16 
//   bit value from target y ending up in the lowest position.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPUNPCKHWD,  
//                             // XMM0[79:64]    -> XMM1[15:0]
//                             // [RBX][79:64]   -> XMM1[31:16]
//                             // XMM0[95:80]    -> XMM1[47:32]
//                             // [RBX][95:80]   -> XMM1[63:48]
//                             // XMM0[111:96]   -> XMM1[79:64]
//                             // [RBX][111:96]  -> XMM1[95:80]
//                             // XMM0[127:112]  -> XMM1[111:96]
//                             // [RBX][127:112] -> XMM1[127:112]
//
//  YMM2  YMM0  YMM1  VPUNPCKHWD,     
//                             // YMM0[79:64]    -> YMM1[15:0]
//                             // YMM2[79:64]    -> YMM1[31:16]
//                             // YMM0[95:80]    -> YMM1[47:32]
//                             // YMM2[95:80]    -> YMM1[63:48]
//                             // YMM0[111:96]   -> YMM1[79:64]
//                             // YMM2[111:96]   -> YMM1[95:80]
//                             // YMM0[127:112]  -> YMM1[111:96]
//                             // YMM2[127:112]  -> YMM1[127:112]
//                             // ...
//                             // YMM0[255:240]  -> YMM1[239:224]
//                             // YMM2[255:240]  -> YMM1[255:240]
//
//  XMM1 <-  XMM0  XMM2  VPUNPCKHWD, 
//                               // XMM0[79:64]    -> XMM1[15:0]
//                               // XMM2[79:64]    -> XMM1[31:16]
//                               // XMM0[95:80]    -> XMM1[47:32]
//                               // XMM2[95:80]    -> XMM1[63:48]
//                               // XMM0[111:96]   -> XMM1[79:64]
//                               // XMM2[111:96]   -> XMM1[95:80]
//                               // XMM0[127:112]  -> XMM1[111:96]
//                               // XMM2[127:112]  -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpunpckhdqcomma ( PUNPCKHDQ, )
//
// C prototype:
//  void dg_forthpunpckhdqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PUNPCKHDQ instruction. This sequence moves 32 bit values from the
//   high half of the source, and 32 bit values from the high half of the
//   destination to the destination. The 32 bit values are alternated from
//   the destination and source, with the lowest 32 bit value from the
//   destination ending up in the lowest position.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PUNPCKHDQ,  // XMM1[95:64]    -> XMM1[31:0]
//                             // [RBX][95:64]   -> XMM1[63:32]
//                             // XMM1[127:96]   -> XMM1[95:64]
//                             // [RBX][127:96]  -> XMM1[127:96]
//
//
//  RBX [R]  ST1  PUNPCKHDQ,   // ST1[63:32]    -> ST1[31:0]
//                             // [RBX][63:32]  -> ST1[63:32]
//
//  XMM2  XMM1  PUNPCKHDQ,     // XMM1[95:64]   -> XMM1[31:0]
//                             // XMM2[95:64]   -> XMM1[63:32]
//                             // XMM1[127:96]  -> XMM1[95:64]
//                             // XMM2[127:96]  -> XMM1[127:96]
//
//  ST2  ST1  PUNPCKHDQ,       // ST1[63:32]  -> ST1[31:0]
//                             // ST2[63:32]  -> ST1[63:32]
//
//  XMM1 <-  XMM2  PUNPCKHDQ, // XMM1[95:64]   -> XMM1[31:0]
//                               // XMM2[95:64]   -> XMM1[63:32]
//                               // XMM1[127:96]  -> XMM1[95:64]
//                               // XMM2[127:96]  -> XMM1[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpunpckhdqcomma ( VPUNPCKHDQ, )
//
// C prototype:
//  void dg_forthvpunpckhdqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPUNPCKHDQ instruction. This sequence moves 32 bit values from the
//   high half of each 128 bit section of the source, and 32 bit values from the 
//   high half of each 128 bit section of target y to the destination. The 32
//   bit values are alternated from target y and the source, with the lowest
//   32 bit value from target y ending up in the lowest position.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPUNPCKHDQ,  
//                             // XMM0[95:64]    -> XMM1[31:0]
//                             // [RBX][95:64]   -> XMM1[63:32]
//                             // XMM0[127:96]   -> XMM1[95:64]
//                             // [RBX][127:96]  -> XMM1[127:96]
//
//  YMM2  YMM0  YMM1  VPUNPCKHDQ,     
//                             // YMM0[95:64]   -> YMM1[31:0]
//                             // YMM2[95:64]   -> YMM1[63:32]
//                             // YMM0[127:96]  -> YMM1[95:64]
//                             // YMM2[127:96]  -> YMM1[127:96]
//                             // YMM0[223:192] -> YMM1[159:128]
//                             // YMM2[223:192] -> YMM1[191:160]
//                             // YMM0[255:224] -> YMM1[223:192]
//                             // YMM2[255:224] -> YMM1[255:224]
//
//  XMM1 <-  XMM0  XMM2  VPUNPCKHDQ, 
//                               // XMM0[95:64]   -> XMM1[31:0]
//                               // XMM2[95:64]   -> XMM1[63:32]
//                               // XMM0[127:96]  -> XMM1[95:64]
//                               // XMM2[127:96]  -> XMM1[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpunpckhqdqcomma ( PUNPCKHQDQ, )
//
// C prototype:
//  void dg_forthpunpckhqdqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PUNPCKHQDQ instruction. This sequence moves the 64 bit value from
//   the high half of the source, and the 64 bit value from the high half of
//   the destination to the destination. The 64 bit value from the
//   destination ends up in the lowest position and the 64 bit value from
//   the source ends up in the highest position.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PUNPCKHQDQ,  // XMM1[127:64]  -> XMM1[63:0]
//                              // [RBX][127:64] -> XMM1[127:64]
//
//  XMM2  XMM1  PUNPCKHQDQ,     // XMM1[127:64] -> XMM1[63:0]
//                              // XMM2[127:64] -> XMM1[127:64]
//
//  XMM1 <-  XMM2  PUNPCKHQDQ,  // XMM1[127:64] -> XMM1[63:0]
//                                 // XMM2[127:64] -> XMM1[127:64]
//
//  XMM1  XMM8  PUNPCKHQDQ,     // XMM8[127:64] -> XMM8[63:0]
//                              // XMM1[127:64] -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. If the source is not memory, then it must also be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpunpckhqdqcomma ( VPUNPCKHQDQ, )
//
// C prototype:
//  void dg_forthvpunpckhqdqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPUNPCKHQDQ instruction. This sequence moves the 64 bit value from
//   the high half of each 128 bit section of the source, and the 64 bit value 
//   from the high half of each 128 bit section of target y to the destination. 
//   Within each 128 bit section, the 64 bit value from target y ends up in
//   the lowest position and the 64 bit value from the source ends up in the
//   highest position.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPUNPCKHQDQ,  
//                              // XMM0[127:64]  -> XMM1[63:0]
//                              // [RBX][127:64] -> XMM1[127:64]
//
//  XMM2  XMM0  XMM1  VPUNPCKHQDQ,     
//                              // XMM0[127:64] -> XMM1[63:0]
//                              // XMM2[127:64] -> XMM1[127:64]
//
//  XMM1 <-  XMM0  XMM2  VPUNPCKHQDQ,  
//                              // XMM0[127:64] -> XMM1[63:0]
//                              // XMM2[127:64] -> XMM1[127:64]
//
//  YMM1  YMM0  YMM8  VPUNPCKHQDQ,     
//                              // YMM0[127:64]  -> YMM8[63:0]
//                              // YMM1[127:64]  -> YMM8[127:64]
//                              // YMM0[255:192] -> YMM8[191:128]
//                              // YMM1[255:192] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpunpcklbwcomma ( PUNPCKLBW, )
//
// C prototype:
//  void dg_forthpunpcklbwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PUNPCKLBW instruction. This sequence moves bytes from the low half
//   of the source, and bytes from the low half of the destination to the
//   destination. The bytes are alternated from the source and destination with
//   the lowest byte from the destination ending up in the same lowest position.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PUNPCKLBW,  // XMM1[7:0]    -> XMM1[7:0]
//                             // [RBX][7:0]   -> XMM1[15:8]
//                             // XMM1[15:8]   -> XMM1[23:16]
//                             // [RBX][15:8]  -> XMM1[31:24]
//                             // XMM1[23:16]  -> XMM1[39:32]
//                             // [RBX][23:16] -> XMM1[47:40]
//                             // XMM1[31:24]  -> XMM1[55:48]
//                             // [RBX][31:24] -> XMM1[63:56]
//                             // XMM1[39:32]  -> XMM1[71:64]
//                             // [RBX][39:32] -> XMM1[79:72]
//                             // XMM1[47:40]  -> XMM1[87:80]
//                             // [RBX][47:40] -> XMM1[95:88]
//                             // XMM1[55:48]  -> XMM1[103:96]
//                             // [RBX][55:48] -> XMM1[111:104]
//                             // XMM1[63:56]  -> XMM1[119:112]
//                             // [RBX][63:56] -> XMM1[127:120]
//
//
//  RBX [R]  ST1  PUNPCKLBW,   // ST1[7:0]     -> ST1[7:0]
//                             // [RBX][7:0]   -> ST1[15:8]
//                             // ST1[15:8]    -> ST1[23:16]
//                             // [RBX][15:8]  -> ST1[31:24]
//                             // ST1[23:16]   -> ST1[39:32]
//                             // [RBX][23:16] -> ST1[47:40]
//                             // ST1[31:24]   -> ST1[55:48]
//                             // [RBX][31:24] -> ST1[63:56]
//
//  XMM2  XMM1  PUNPCKLBW,     // XMM1[7:0]   -> XMM1[7:0]
//                             // XMM2[7:0]   -> XMM1[15:8]
//                             // XMM1[15:8]  -> XMM1[23:16]
//                             // XMM2[15:8]  -> XMM1[31:24]
//                             // XMM1[23:16] -> XMM1[39:32]
//                             // XMM2[23:16] -> XMM1[47:40]
//                             // XMM1[31:24] -> XMM1[55:48]
//                             // XMM2[31:24] -> XMM1[63:56]
//                             // XMM1[39:32] -> XMM1[71:64]
//                             // XMM2[39:32] -> XMM1[79:72]
//                             // XMM1[47:40] -> XMM1[87:80]
//                             // XMM2[47:40] -> XMM1[95:88]
//                             // XMM1[55:48] -> XMM1[103:96]
//                             // XMM2[55:48] -> XMM1[111:104]
//                             // XMM1[63:56] -> XMM1[119:112]
//                             // XMM2[63:56] -> XMM1[127:120]
//
//  ST2  ST1  PUNPCKLBW,       // ST1[7:0]   -> ST1[7:0]
//                             // ST2[7:0]   -> ST1[15:8]
//                             // ST1[15:8]  -> ST1[23:16]
//                             // ST2[15:8]  -> ST1[31:24]
//                             // ST1[23:16] -> ST1[39:32]
//                             // ST2[23:16] -> ST1[47:40]
//                             // ST1[31:24] -> ST1[55:48]
//                             // ST2[31:24] -> ST1[63:56]
//
//  XMM1 <-  XMM2  PUNPCKLBW, // XMM1[7:0]   -> XMM1[7:0]
//                               // XMM2[7:0]   -> XMM1[15:8]
//                               // XMM1[15:8]  -> XMM1[23:16]
//                               // XMM2[15:8]  -> XMM1[31:24]
//                               // XMM1[23:16] -> XMM1[39:32]
//                               // XMM2[23:16] -> XMM1[47:40]
//                               // XMM1[31:24] -> XMM1[55:48]
//                               // XMM2[31:24] -> XMM1[63:56]
//                               // XMM1[39:32] -> XMM1[71:64]
//                               // XMM2[39:32] -> XMM1[79:72]
//                               // XMM1[47:40] -> XMM1[87:80]
//                               // XMM2[47:40] -> XMM1[95:88]
//                               // XMM1[55:48] -> XMM1[103:96]
//                               // XMM2[55:48] -> XMM1[111:104]
//                               // XMM1[63:56] -> XMM1[119:112]
//                               // XMM2[63:56] -> XMM1[127:120]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpunpcklbwcomma ( VPUNPCKLBW, )
//
// C prototype:
//  void dg_forthvpunpcklbwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPUNPCKLBW instruction. This sequence moves bytes from the low half
//   of each 128 bit section of the source, and bytes from the low half of each
//   128 bit section of target y to the  destination. The bytes are alternated 
//   from the source and target y with the lowest byte from target y ending up 
//   in the lowest position.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPUNPCKLBW,  
//                             // XMM0[7:0]    -> XMM1[7:0]
//                             // [RBX][7:0]   -> XMM1[15:8]
//                             // XMM0[15:8]   -> XMM1[23:16]
//                             // [RBX][15:8]  -> XMM1[31:24]
//                             // XMM0[23:16]  -> XMM1[39:32]
//                             // [RBX][23:16] -> XMM1[47:40]
//                             // XMM0[31:24]  -> XMM1[55:48]
//                             // [RBX][31:24] -> XMM1[63:56]
//                             // XMM0[39:32]  -> XMM1[71:64]
//                             // [RBX][39:32] -> XMM1[79:72]
//                             // XMM0[47:40]  -> XMM1[87:80]
//                             // [RBX][47:40] -> XMM1[95:88]
//                             // XMM0[55:48]  -> XMM1[103:96]
//                             // [RBX][55:48] -> XMM1[111:104]
//                             // XMM0[63:56]  -> XMM1[119:112]
//                             // [RBX][63:56] -> XMM1[127:120]
//
//  YMM2  YMM0  YMM1  VPUNPCKLBW,     
//                             // YMM0[7:0]   -> YMM1[7:0]
//                             // YMM2[7:0]   -> YMM1[15:8]
//                             // YMM0[15:8]  -> YMM1[23:16]
//                             // YMM2[15:8]  -> YMM1[31:24]
//                             // YMM0[23:16] -> YMM1[39:32]
//                             // YMM2[23:16] -> YMM1[47:40]
//                             // YMM0[31:24] -> YMM1[55:48]
//                             // YMM2[31:24] -> YMM1[63:56]
//                             // YMM0[39:32] -> YMM1[71:64]
//                             // YMM2[39:32] -> YMM1[79:72]
//                             // YMM0[47:40] -> YMM1[87:80]
//                             // YMM2[47:40] -> YMM1[95:88]
//                             // YMM0[55:48] -> YMM1[103:96]
//                             // YMM2[55:48] -> YMM1[111:104]
//                             // YMM0[63:56] -> YMM1[119:112]
//                             // YMM2[63:56] -> YMM1[127:120]
//                             // YMM0[135:128] -> YMM1[135:128]
//                             // YMM2[135:128] -> YMM1[143:136]
//                             // ...
//                             // YMM0[191:184] -> YMM1[247:240]
//                             // YMM2[191:184] -> YMM1[255:248]
//
//  XMM1 <-  XMM0  XMM2  VPUNPCKLBW, 
//                               // XMM0[7:0]   -> XMM1[7:0]
//                               // XMM2[7:0]   -> XMM1[15:8]
//                               // XMM0[15:8]  -> XMM1[23:16]
//                               // XMM2[15:8]  -> XMM1[31:24]
//                               // XMM0[23:16] -> XMM1[39:32]
//                               // XMM2[23:16] -> XMM1[47:40]
//                               // XMM0[31:24] -> XMM1[55:48]
//                               // XMM2[31:24] -> XMM1[63:56]
//                               // XMM0[39:32] -> XMM1[71:64]
//                               // XMM2[39:32] -> XMM1[79:72]
//                               // XMM0[47:40] -> XMM1[87:80]
//                               // XMM2[47:40] -> XMM1[95:88]
//                               // XMM0[55:48] -> XMM1[103:96]
//                               // XMM2[55:48] -> XMM1[111:104]
//                               // XMM0[63:56] -> XMM1[119:112]
//                               // XMM2[63:56] -> XMM1[127:120]
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpunpcklwdcomma ( PUNPCKLWD, )
//
// C prototype:
//  void dg_forthpunpcklwdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PUNPCKLWD instruction. This sequence moves 16 bit values from the
//   low half of the source, and 16 bit values from the low half of the
//   destination to the destination. The 16 bit values are alternated from the
//   source and destination with the lowest 16 bit value from the destination
//   ending up in the same lowest position.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PUNPCKLWD,  // XMM1[15:0]   -> XMM1[15:0]
//                             // [RBX][15:0]  -> XMM1[31:16]
//                             // XMM1[31:16]  -> XMM1[47:32]
//                             // [RBX][31:16] -> XMM1[63:48]
//                             // XMM1[47:32]  -> XMM1[79:64]
//                             // [RBX][47:32] -> XMM1[95:80]
//                             // XMM1[63:48]  -> XMM1[111:96]
//                             // [RBX][63:48] -> XMM1[127:112]
//
//
//  RBX [R]  ST1  PUNPCKLWD,   // ST1[15:0]    -> ST1[15:0]
//                             // [RBX][15:0]  -> ST1[31:16]
//                             // ST1[31:16]   -> ST1[47:32]
//                             // [RBX][31:16] -> ST1[63:48]
//
//  XMM2  XMM1  PUNPCKLWD,     // XMM1[15:0]  -> XMM1[15:0]
//                             // XMM2[15:0]  -> XMM1[31:16]
//                             // XMM1[31:16] -> XMM1[47:32]
//                             // XMM2[31:16] -> XMM1[63:48]
//                             // XMM1[47:32] -> XMM1[79:64]
//                             // XMM2[47:32] -> XMM1[95:80]
//                             // XMM1[63:48] -> XMM1[111:96]
//                             // XMM2[63:48] -> XMM1[127:112]
//
//  ST2  ST1  PUNPCKLWD,       // ST1[15:0]  -> ST1[15:0]
//                             // ST2[15:0]  -> ST1[31:16]
//                             // ST1[31:16] -> ST1[47:32]
//                             // ST2[31:16] -> ST1[63:48]
//
//  XMM1 <-  XMM2  PUNPCKLWD, // XMM1[15:0]  -> XMM1[15:0]
//                               // XMM2[15:0]  -> XMM1[31:16]
//                               // XMM1[31:16] -> XMM1[47:32]
//                               // XMM2[31:16] -> XMM1[63:48]
//                               // XMM1[47:32] -> XMM1[79:64]
//                               // XMM2[47:32] -> XMM1[95:80]
//                               // XMM1[63:48] -> XMM1[111:96]
//                               // XMM2[63:48] -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpunpcklwdcomma ( VPUNPCKLWD, )
//
// C prototype:
//  void dg_forthvpunpcklwdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPUNPCKLWD instruction. This sequence moves 16 bit values from the
//   low half of each 128 bit section of the source, and 16 bit values from the
//   low half of each 128 bit section of target y to the destination. The 16 
//   bit values are alternated from the source and target y with the lowest 16 
//   bit value from target y ending up in the lowest position.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPUNPCKLWD,  
//                             // XMM0[15:0]   -> XMM1[15:0]
//                             // [RBX][15:0]  -> XMM1[31:16]
//                             // XMM0[31:16]  -> XMM1[47:32]
//                             // [RBX][31:16] -> XMM1[63:48]
//                             // XMM0[47:32]  -> XMM1[79:64]
//                             // [RBX][47:32] -> XMM1[95:80]
//                             // XMM0[63:48]  -> XMM1[111:96]
//                             // [RBX][63:48] -> XMM1[127:112]
//
//  YMM2  YMM0  YMM1  VPUNPCKLWD,     
//                             // YMM0[15:0]  -> YMM1[15:0]
//                             // YMM2[15:0]  -> YMM1[31:16]
//                             // YMM0[31:16] -> YMM1[47:32]
//                             // YMM2[31:16] -> YMM1[63:48]
//                             // YMM0[47:32] -> YMM1[79:64]
//                             // YMM2[47:32] -> YMM1[95:80]
//                             // YMM0[63:48] -> YMM1[111:96]
//                             // YMM2[63:48] -> YMM1[127:112]
//                             // YMM0[143:128] -> YMM1[143:128]
//                             // YMM2[143:128] -> YMM1[159:144]
//                             // ...
//                             // YMM0[191:176] -> YMM1[239:224]
//                             // YMM2[191:176] -> YMM1[255:240]
//
//  XMM1 <-  XMM0  XMM2  VPUNPCKLWD, 
//                               // XMM0[15:0]  -> XMM1[15:0]
//                               // XMM2[15:0]  -> XMM1[31:16]
//                               // XMM0[31:16] -> XMM1[47:32]
//                               // XMM2[31:16] -> XMM1[63:48]
//                               // XMM0[47:32] -> XMM1[79:64]
//                               // XMM2[47:32] -> XMM1[95:80]
//                               // XMM0[63:48] -> XMM1[111:96]
//                               // XMM2[63:48] -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpunpckldqcomma ( PUNPCKLDQ, )
//
// C prototype:
//  void dg_forthpunpckldqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PUNPCKLDQ instruction. This sequence moves 32 bit values from the
//   low half of the source, and 32 bit values from the low half of the
//   destination to the destination. The 32 bit values are alternated from the
//   source and destination with the lowest 32 bit value from the destination
//   ending up in the same lowest position.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PUNPCKLDQ,  // XMM1[31:0]   -> XMM1[31:0]
//                             // [RBX][31:0]  -> XMM1[63:32]
//                             // XMM1[63:32]  -> XMM1[95:64]
//                             // [RBX][63:32] -> XMM1[127:96]
//
//
//  RBX [R]  ST1  PUNPCKLDQ,   // ST1[31:0]   -> ST1[31:0]
//                             // [RBX][31:0] -> ST1[63:32]
//
//  XMM2  XMM1  PUNPCKLDQ,     // XMM1[31:0]  -> XMM1[31:0]
//                             // XMM2[31:0]  -> XMM1[63:32]
//                             // XMM1[63:32] -> XMM1[95:64]
//                             // XMM2[63:32] -> XMM1[127:96]
//
//  ST2  ST1  PUNPCKLDQ,       // ST1[31:0] -> ST1[31:0]
//                             // ST2[31:0] -> ST1[63:32]
//
//  XMM1 <-  XMM2  PUNPCKLDQ, // XMM1[31:0]   -> XMM1[31:0]
//                               // XMM2[31:0]   -> XMM1[63:32]
//                               // XMM1[63:32]  -> XMM1[95:64]
//                               // XMM2[63:32]  -> XMM1[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpunpckldqcomma ( VPUNPCKLDQ, )
//
// C prototype:
//  void dg_forthvpunpckldqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPUNPCKLDQ instruction. This sequence moves 32 bit values from the
//   low half of each 128 bit section of the source, and 32 bit values from the 
//   low half of each 128 bit section of target y to the destination. The 32 bit
//   values are alternated from the source and target y with the lowest 32 bit
//   value from target y ending up in the lowest position.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPUNPCKLDQ,  
//                             // XMM0[31:0]   -> XMM1[31:0]
//                             // [RBX][31:0]  -> XMM1[63:32]
//                             // XMM0[63:32]  -> XMM1[95:64]
//                             // [RBX][63:32] -> XMM1[127:96]
//
//  YMM2  YMM0  YMM1  VPUNPCKLDQ,     
//                             // YMM0[31:0]    -> YMM1[31:0]
//                             // YMM2[31:0]    -> YMM1[63:32]
//                             // YMM0[63:32]   -> YMM1[95:64]
//                             // YMM2[63:32]   -> YMM1[127:96]
//                             // YMM0[159:128] -> YMM1[159:128]
//                             // YMM2[159:128] -> YMM1[191:160]
//                             // YMM0[191:160] -> YMM1[223:192]
//                             // YMM2[191:160] -> YMM1[255:224]
//
//  XMM1 <-  XMM0  XMM2  VPUNPCKLDQ, 
//                             // XMM0[31:0]   -> XMM1[31:0]
//                             // XMM2[31:0]   -> XMM1[63:32]
//                             // XMM0[63:32]  -> XMM1[95:64]
//                             // XMM2[63:32]  -> XMM1[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpunpcklqdqcomma ( PUNPCKLQDQ, )
//
// C prototype:
//  void dg_forthpunpcklqdqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PUNPCKLQDQ instruction. This sequence moves the 64 bit value from
//   the low half of the source, and the 64 bit value from the low half of
//   the destination to the destination. The 64 bit values are alternated from
//   the source and destination with the 64 bit value from the destination
//   ending up in the same lowest position.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PUNPCKLQDQ,    // [RBX][63:0]  -> XMM1[127:64]
//
//  XMM2  XMM1  PUNPCKLQDQ,       // XMM2[63:0]   -> XMM1[127:64]
//
//  XMM1 <-  XMM2  PUNPCKLQDQ, // XMM2[63:0]   -> XMM1[127:64]
//
//  XMM1  XMM8  PUNPCKLQDQ,       // XMM1[63:0]   -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. If the source is not memory, then it must also be an xmm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpunpcklqdqcomma ( VPUNPCKLQDQ, )
//
// C prototype:
//  void dg_forthvpunpcklqdqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPUNPCKLQDQ instruction. This sequence moves the 64 bit value from
//   the low half of each 128 bit section of the source, and the 64 bit value 
//   from the low half of each 128 bit section of target y to the destination. 
//   The 64 bit values are alternated from the source and target y with the 64
//   bit value from target y ending up in the lowest position.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPUNPCKLQDQ,    // XMM0[63:0]   -> XMM1[63:0]
//                                       // [RBX][63:0]  -> XMM1[127:64]
//
//  XMM2  XMM0  XMM1  VPUNPCKLQDQ,       // XMM0[63:0]   -> XMM1[63:0]
//                                       // XMM2[63:0]   -> XMM1[127:64]
//
//  XMM1 <-  XMM0  XMM2  VPUNPCKLQDQ, // XMM0[63:0]   -> XMM1[63:0]
//                                       // XMM2[63:0]   -> XMM1[127:64]
//
//  YMM1  YMM0  YMM8  VPUNPCKLQDQ,       // YMM0[63:0]    -> YMM8[63:0]
//                                       // YMM1[63:0]    -> YMM8[127:64]
//                                       // YMM0[191:128] -> YMM8[191:128]
//                                       // YMM1[191:128] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpxorcomma ( PXOR, )
//
// C prototype:
//  void dg_forthpxorcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PXOR instruction. This sequence does a binary xor of the two targets
//   and puts the result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PXOR,    // [RBX][127:0] xor XMM1[127:0] -> XMM1[127:0]
//
//  RBX [R]  ST1  PXOR,     // [RBX][63:0] xor ST1[63:0] -> ST1[63:0]
//
//  XMM2  XMM1  PXOR,       // XMM2[127:0] xor XMM1[127:0] -> XMM1[127:0]
//
//  ST2  ST1  PXOR,         // ST2[63:0] xor ST1[63:0] -> ST1[63:0]
//
//  XMM1 <-  XMM2  PXOR, // XMM2[127:0] xor XMM1[127:0] -> XMM1[127:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpxorcomma ( VPXOR, )
//
// C prototype:
//  void dg_forthvpxorcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPXOR instruction. This opcode sequence does a binary xor of
//   target y with the source and puts the result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPXOR,  
//                           // XMM0[127:0] xor [RBX][127:0] -> XMM1[127:0]
//
//  XMM2  XMM0  XMM1  VPXOR,       
//                           // XMM0[127:0] xor XMM2[127:0] -> XMM1[127:0]
//
//  XMM1 <-  XMM0  XMM2  VPXOR, 
//                           // XMM0[127:0] xor XMM2[127:0] -> XMM1[127:0]
//
//  YMM1  YMM0  YMM8  VPXOR,       
//                           // YMM0[255:0] xor YMM1[255:0] -> YMM8[255:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpmaddubswcomma ( PMADDUBSW, )
//
// C prototype:
//  void dg_forthpmaddubswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMADDUBSW instruction. This sequence multiplies each unsigned byte
//   from the destination with the corresponding signed byte from the source to
//   produce intermediary signed 16 bit integer results. Then adjacent pairs of
//   results are added together, then if the new value is greater or less than
//   the range of a 16 bit signed integer, the new value is limited to the
//   greatest (HEX 7FFF) or least (HEX -8000) 16 bit signed integer. Then
//   the new value is stored to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  destination bytes are zero extended to 16 bits before multiplication
//  source bytes are sign extended to 16 bits before multiplication
//
//  RBX [R]  XMM1  PMADDUBSW,    //   [RBX][7:0] * XMM1[7:0] +
//                               //   [RBX][15:8] * XMM1[15:8]
//                               //   -> XMM1[15:0]
//                               //   [RBX][23:16] * XMM1[23:16] +
//                               //   [RBX][31:24] * XMM1[31:24]
//                               //   -> XMM1[31:16]
//                               //  ...
//                               //   [RBX][119:112] * XMM1[119:112] +
//                               //   [RBX][127:120] * XMM1[127:120]
//                               //   -> XMM1[127:112]
//
//  RBX [R]  ST1  PMADDUBSW,     //   [RBX][7:0] * ST1[7:0] +
//                               //   [RBX][15:8] * ST1[15:8]
//                               //   -> ST1[15:0]
//                               //   [RBX][23:16] * ST1[23:16] +
//                               //   [RBX][31:24] * ST1[31:24]
//                               //   -> ST1[31:16]
//                               //   [RBX][39:32] * ST1[39:32] +
//                               //   [RBX][47:40] * ST1[47:40]
//                               //   -> ST1[47:32]
//                               //   [RBX][55:48] * ST1[55:48] +
//                               //   [RBX][63:56] * ST1[63:56]
//                               //   -> ST1[63:48]
//
//  XMM2  XMM1  PMADDUBSW,       //   XMM2[7:0] * XMM1[7:0] +
//                               //   XMM2[15:8] * XMM1[15:8]
//                               //   -> XMM1[15:0]
//                               //   XMM2[23:16] * XMM1[23:16] +
//                               //   XMM2[31:24] * XMM1[31:24]
//                               //   -> XMM1[31:16]
//                               //  ...
//                               //   XMM2[119:112] * XMM1[119:112] +
//                               //   XMM2[127:120] * XMM1[127:120]
//                               //   -> XMM1[127:112]
//
//  ST2  ST1  PMADDUBSW,         //   ST2[7:0] * ST1[7:0] +
//                               //   ST2[15:8] * ST1[15:8]
//                               //   -> ST1[15:0]
//                               //   ST2[23:16] * ST1[23:16] +
//                               //   ST2[31:24] * ST1[31:24]
//                               //   -> ST1[31:16]
//                               //   ST2[39:32] * ST1[39:32] +
//                               //   ST2[47:40] * ST1[47:40]
//                               //   -> ST1[47:32]
//                               //   ST2[55:48] * ST1[55:48] +
//                               //   ST2[63:56] * ST1[63:56]
//                               //   -> ST1[63:48]
//
//  XMM1 <-  XMM2  PMADDUBSW, //   XMM2[7:0] * XMM1[7:0] +
//                               //   XMM2[15:8] * XMM1[15:8]
//                               //   -> XMM1[15:0]
//                               //   XMM2[23:16] * XMM1[23:16] +
//                               //   XMM2[31:24] * XMM1[31:24]
//                               //   -> XMM1[31:16]
//                               //  ...
//                               //   XMM2[119:112] * XMM1[119:112] +
//                               //   XMM2[127:120] * XMM1[127:120]
//                               //   -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpmaddubwdcomma ( PMADDUBWD, )
//
// C prototype:
//  void dg_forthpmaddubwdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMADDUBWD instruction. This sequence adds each unsigned 16 bit
//   integer from the destination with the corresponding signed 16 bit integer
//   from the source to produce intermediary signed 32 bit integer results.
//   Then pairs of results are added together, then if the new value is
//   greater or less than the range of a 32 bit signed integer, the new value
//   is limited to the greatest (HEX 7FFFFFFF) or least (HEX -80000000) 32 bit
//   signed integer. Then the new value is stored to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  destination 16 bit values are zero extended to 32 bits before addition
//  source 16 bit values are sign extended to 32 bits before addition
//
//  RBX [R]  XMM1  PMADDUBWD,    //   [RBX][15:0] + XMM1[15:0] +
//                               //   [RBX][31:16] + XMM1[31:16]
//                               //   -> XMM1[31:0]
//                               //   [RBX][47:32] + XMM1[47:32] +
//                               //   [RBX][63:48] + XMM1[63:48]
//                               //   -> XMM1[63:32]
//                               //   [RBX][79:64] + XMM1[79:64] +
//                               //   [RBX][95:80] + XMM1[95:80]
//                               //   -> XMM1[95:64]
//                               //   [RBX][111:96] + XMM1[111:96] +
//                               //   [RBX][127:112] + XMM1[127:112]
//                               //   -> XMM1[127:96]
//
//  RBX [R]  ST1  PMADDUBWD,     //   [RBX][15:0] + ST1[15:0] +
//                               //   [RBX][31:16] + ST1[31:16]
//                               //   -> ST1[31:0]
//                               //   [RBX][47:32] + ST1[47:32] +
//                               //   [RBX][63:48] + ST1[63:48]
//                               //   -> ST1[63:32]
//
//  XMM2  XMM1  PMADDUBWD,       //   XMM2[15:0] + XMM1[15:0] +
//                               //   XMM2[31:16] + XMM1[31:16]
//                               //   -> XMM1[31:0]
//                               //   XMM2[47:32] + XMM1[47:32] +
//                               //   XMM2[63:48] + XMM1[63:48]
//                               //   -> XMM1[63:32]
//                               //   XMM2[79:64] + XMM1[79:64] +
//                               //   XMM2[95:80] + XMM1[95:80]
//                               //   -> XMM1[95:64]
//                               //   XMM2[111:96] + XMM1[111:96] +
//                               //   XMM2[127:112] + XMM1[127:112]
//                               //   -> XMM1[127:96]
//
//  ST2  ST1  PMADDUBWD,         //   ST2[15:0] + ST1[15:0] +
//                               //   ST2[31:16] + ST1[31:16]
//                               //   -> ST1[31:0]
//                               //   ST2[47:32] + ST1[47:32] +
//                               //   ST2[63:48] + ST1[63:48]
//                               //   -> ST1[63:32]
//
//  XMM1 <-  XMM2  PMADDUBWD, //   XMM2[15:0] + XMM1[15:0] +
//                               //   XMM2[31:16] + XMM1[31:16]
//                               //   -> XMM1[31:0]
//                               //   XMM2[47:32] + XMM1[47:32] +
//                               //   XMM2[63:48] + XMM1[63:48]
//                               //   -> XMM1[63:32]
//                               //   XMM2[79:64] + XMM1[79:64] +
//                               //   XMM2[95:80] + XMM1[95:80]
//                               //   -> XMM1[95:64]
//                               //   XMM2[111:96] + XMM1[111:96] +
//                               //   XMM2[127:112] + XMM1[127:112]
//                               //   -> XMM1[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpsadbwcomma ( PSADBW, )
//
// C prototype:
//  void dg_forthpsadbwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PSADBW instruction. This sequence subtracts each unsigned byte in
//   the source from the corresponding unsigned byte in the destination then
//   gets the absolute value of each result. The unsigned results in each
//   group of 8 bytes are then added together and each sum is put into the
//   same 64 bit group of the destination.
//   This means only the lower two bytes of each 64 bit group of the
//   destination will have anything in them, the other bytes will be zeros.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
//  RBX [R]  XMM1  PSADBW,    //   ABS(XMM1[7:0] - [RBX][7:0]) +
//                            //   ABS(XMM1[15:8] - [RBX][15:8]) +
//                            //   ... +
//                            //   ABS(XMM1[127:120] - [RBX][127:120]) +
//                            //   -> XMM1[127:0]
//
//  RBX [R]  ST1  PSADBW,     //   ABS(ST1[7:0] - [RBX][7:0]) +
//                            //   ABS(ST1[15:8] - [RBX][15:8]) +
//                            //   ... +
//                            //   ABS(ST1[63:56] - [RBX][63:56]) +
//                            //   -> ST1[63:0]
//
//  XMM2  XMM1  PSADBW,       //   ABS(XMM1[7:0] - XMM2[7:0]) +
//                            //   ABS(XMM1[15:8] - XMM2[15:8]) +
//                            //   ... +
//                            //   ABS(XMM1[127:120] - XMM2[127:120]) +
//                            //   -> XMM1[127:0]
//
//  ST2  ST1  PSADBW,         //   ABS(ST1[7:0] - ST2[7:0]) +
//                            //   ABS(ST1[15:8] - ST2[15:8]) +
//                            //   ... +
//                            //   ABS(ST1[63:56] - ST2[63:56]) +
//                            //   -> ST1[63:0]
//
//  XMM1 <-  XMM2  PSADBW, //   ABS(XMM1[7:0] - XMM2[7:0]) +
//                            //   ABS(XMM1[15:8] - XMM2[15:8]) +
//                            //   ... +
//                            //   ABS(XMM1[127:120] - XMM2[127:120]) +
//                            //   -> XMM1[127:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpsadbwcomma ( VPSADBW, )
//
// C prototype:
//  void dg_forthvpsadbwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSADBW instruction. This sequence subtracts each unsigned byte in
//   the source from the corresponding unsigned byte in target y then
//   gets the absolute value of each result. The unsigned results in each
//   group of 8 bytes are then added together and each sum is put into the
//   same 64 bit group of the destination.
//   This means only the lower two bytes of each 64 bit group of the
//   destination will have anything in them, the other bytes will be zeros.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
//  RBX [R]  XMM0  XMM1  VPSADBW,    
//                            //   ABS(XMM0[7:0] - [RBX][7:0]) +
//                            //   ABS(XMM0[15:8] - [RBX][15:8]) +
//                            //   ... +
//                            //   ABS(XMM0[127:120] - [RBX][127:120]) +
//                            //   -> XMM1[127:0]
//
//  YMM2  YMM0  YMM1  VPSADBW,       
//                            //   ABS(YMM0[7:0] - YMM2[7:0]) +
//                            //   ABS(YMM0[15:8] - YMM2[15:8]) +
//                            //   ... +
//                            //   ABS(YMM0[255:248] - YMM2[255:248]) +
//                            //   -> YMM1[255:0]
//
//  XMM1 <-  XMM0  XMM2  VPSADBW, 
//                            //   ABS(XMM0[7:0] - XMM2[7:0]) +
//                            //   ABS(XMM0[15:8] - XMM2[15:8]) +
//                            //   ... +
//                            //   ABS(XMM0[127:120] - XMM2[127:120]) +
//                            //   -> XMM1[127:0]
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrcppscomma ( RCPPS, )
//
// C prototype:
//  void dg_forthrcppscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 RCPPS instruction. This sequence calculates 1/x for each single
//   precision floating point value in the source and puts the results into the
//   destination. Even if there is an exact value for 1/x, the answer will be
//   slightly off. For example, for the inverse of 4 you should get 1/4
//   (which is 3E800000), instead on my processor you get 3E7FF000.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  RCPPS,    // 1/[RBX][31:0]    -> XMM1[31:0]
//                           // 1/[RBX][63:32]   -> XMM1[63:32]
//                           // 1/[RBX][95:64]   -> XMM1[95:64]
//                           // 1/[RBX][127:96]  -> XMM1[127:96]
//
//  XMM2  XMM1  RCPPS,       // 1/XMM2[31:0]    -> XMM1[31:0]
//                           // 1/XMM2[63:32]   -> XMM1[63:32]
//                           // 1/XMM2[95:64]   -> XMM1[95:64]
//                           // 1/XMM2[127:96]  -> XMM1[127:96]
//
//  XMM1 <-  XMM2  RCPPS, // 1/XMM2[31:0]    -> XMM1[31:0]
//                           // 1/XMM2[63:32]   -> XMM1[63:32]
//                           // 1/XMM2[95:64]   -> XMM1[95:64]
//                           // 1/XMM2[127:96]  -> XMM1[127:96]
//
//  XMM1  XMM8  RCPPS,       // 1/XMM1[31:0]    -> XMM8[31:0]
//                           // 1/XMM1[63:32]   -> XMM8[63:32]
//                           // 1/XMM1[95:64]   -> XMM8[95:64]
//                           // 1/XMM1[127:96]  -> XMM8[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. If the source is not memory, then it must also be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvrcppscomma ( VRCPPS, )
//
// C prototype:
//  void dg_forthvrcppscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VRCPPS instruction. This sequence calculates 1/x for each single
//   precision floating point value in the source and puts the results into the
//   destination. Even if there is an exact value for 1/x, the answer will be
//   slightly off. For example, for the inverse of 4 you should get 1/4
//   (which is 3E800000), instead on my processor you get 3E7FF000.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  VRCPPS,   // 1/[RBX][31:0]    -> XMM1[31:0]
//                           // 1/[RBX][63:32]   -> XMM1[63:32]
//                           // 1/[RBX][95:64]   -> XMM1[95:64]
//                           // 1/[RBX][127:96]  -> XMM1[127:96]
//
//  XMM2  XMM1  VRCPPS,       // 1/XMM2[31:0]    -> XMM1[31:0]
//                           // 1/XMM2[63:32]   -> XMM1[63:32]
//                           // 1/XMM2[95:64]   -> XMM1[95:64]
//                           // 1/XMM2[127:96]  -> XMM1[127:96]
//
//  XMM1 <-  XMM2  VRCPPS, // 1/XMM2[31:0]    -> XMM1[31:0]
//                           // 1/XMM2[63:32]   -> XMM1[63:32]
//                           // 1/XMM2[95:64]   -> XMM1[95:64]
//                           // 1/XMM2[127:96]  -> XMM1[127:96]
//
//  YMM1  YMM8  VRCPPS,       // 1/YMM1[31:0]    -> YMM8[31:0]
//                           // 1/YMM1[63:32]   -> YMM8[63:32]
//                           // 1/YMM1[95:64]   -> YMM8[95:64]
//                           // 1/YMM1[127:96]  -> YMM8[127:96]
//                           // 1/YMM1[159:128] -> YMM8[159:128]
//                           // 1/YMM1[191:160] -> YMM8[191:160]
//                           // 1/YMM1[223:192] -> YMM8[223:192]
//                           // 1/YMM1[255:224] -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrcpsscomma ( RCPSS, )
//
// C prototype:
//  void dg_forthrcpsscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 RCPSS instruction. This sequence calculates 1/x for the single
//   precision floating point value in the lower 32 bits of the source and puts
//   the result into the lower 32 bits of the destination. Even if there is an
//   exact value for 1/x, the answer will be slightly off.
//   For example, for the inverse of 4 you should get 1/4 (which is 3E800000),
//   instead on my processor you get 3E7FF000.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  RCPSS,    // 1/[RBX][31:0]  -> XMM1[31:0]
//
//  XMM2  XMM1  RCPSS,       // 1/XMM2[31:0]   -> XMM1[31:0]
//
//  XMM1 <-  XMM2  RCPSS, // 1/XMM2[31:0]   -> XMM1[31:0]
//
//  XMM1  XMM8  RCPSS,       // 1/XMM1[31:0]   -> XMM8[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. If the source is not memory, then it must also be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvrcpsscomma ( VRCPSS, )
//
// C prototype:
//  void dg_forthvrcpsscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VRCPSS instruction. This sequence calculates 1/x for the single
//   precision floating point value in the lower 32 bits of the source and puts
//   the result into the lower 32 bits of the destination. Even if there is an
//   exact value for 1/x, the answer will be slightly off.
//   For example, for the inverse of 4 you should get 1/4 (which is 3E800000),
//   instead on my processor you get 3E7FF000.
//   The rest of the bits in the destination are copied from target y.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VRCPSS,    // 1/[RBX][31:0]  -> XMM1[31:0]
//                                  // XMM0[127:32]   -> XMM1[127:32]
//                    
//  XMM2  XMM0  XMM1  VRCPSS,       // 1/XMM2[31:0]   -> XMM1[31:0]
//                                  // XMM0[127:32]   -> XMM1[127:32]
//
//  XMM1 <-  XMM0  XMM2  VRCPSS, // 1/XMM2[31:0]   -> XMM1[31:0]
//                                  // XMM0[127:32]   -> XMM1[127:32]
//
//  XMM1  XMM0  XMM8  VRCPSS,       // 1/XMM1[31:0]   -> XMM8[31:0]
//                                  // XMM0[127:32]   -> XMM8[127:32]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y
//   must both be xmm registers.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrsqrtpscomma ( RSQRTPS, )
//
// C prototype:
//  void dg_forthrsqrtpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 RSQRTPS instruction. This sequence calculates 1/sqrt(x) for each single
//   precision floating point value in the source and puts the results into the
//   destination. Even if there is an exact value for 1/sqrt(x), the answer will be
//   slightly off.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  RSQRTPS,  // 1/sqrt([RBX][31:0])    -> XMM1[31:0]
//                           // 1/sqrt([RBX][63:32])   -> XMM1[63:32]
//                           // 1/sqrt([RBX][95:64])   -> XMM1[95:64]
//                           // 1/sqrt([RBX][127:96])  -> XMM1[127:96]
//
//  XMM2  XMM1  RSQRTPS,     // 1/sqrt(XMM2[31:0])    -> XMM1[31:0]
//                           // 1/sqrt(XMM2[63:32])   -> XMM1[63:32]
//                           // 1/sqrt(XMM2[95:64])   -> XMM1[95:64]
//                           // 1/sqrt(XMM2[127:96])  -> XMM1[127:96]
//
//  XMM1 <-  XMM2  RSQRTPS, // 1/sqrt(XMM2[31:0])    -> XMM1[31:0]
//                             // 1/sqrt(XMM2[63:32])   -> XMM1[63:32]
//                             // 1/sqrt(XMM2[95:64])   -> XMM1[95:64]
//                             // 1/sqrt(XMM2[127:96])  -> XMM1[127:96]
//
//  XMM1  XMM8  RSQRTPS,     // 1/sqrt(XMM1[31:0])    -> XMM8[31:0]
//                           // 1/sqrt(XMM1[63:32])   -> XMM8[63:32]
//                           // 1/sqrt(XMM1[95:64])   -> XMM8[95:64]
//                           // 1/sqrt(XMM1[127:96])  -> XMM8[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. If the source is not memory, then it must also be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvrsqrtpscomma ( VRSQRTPS, )
//
// C prototype:
//  void dg_forthvrsqrtpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VRSQRTPS instruction. This sequence calculates 1/sqrt(x) for each single
//   precision floating point value in the source and puts the results into the
//   destination. Even if there is an exact value for 1/sqrt(x), the answer will be
//   slightly off.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  VRSQRTPS,  // 1/sqrt([RBX][31:0])    -> XMM1[31:0]
//                            // 1/sqrt([RBX][63:32])   -> XMM1[63:32]
//                            // 1/sqrt([RBX][95:64])   -> XMM1[95:64]
//                            // 1/sqrt([RBX][127:96])  -> XMM1[127:96]
//
//  XMM2  XMM1  VRSQRTPS,    // 1/sqrt(XMM2[31:0])    -> XMM1[31:0]
//                           // 1/sqrt(XMM2[63:32])   -> XMM1[63:32]
//                           // 1/sqrt(XMM2[95:64])   -> XMM1[95:64]
//                           // 1/sqrt(XMM2[127:96])  -> XMM1[127:96]
//
//  XMM1 <-  XMM2  VRSQRTPS, // 1/sqrt(XMM2[31:0])    -> XMM1[31:0]
//                              // 1/sqrt(XMM2[63:32])   -> XMM1[63:32]
//                              // 1/sqrt(XMM2[95:64])   -> XMM1[95:64]
//                              // 1/sqrt(XMM2[127:96])  -> XMM1[127:96]
//
//  YMM1  YMM8  VRSQRTPS,    // 1/sqrt(YMM1[31:0])    -> YMM8[31:0]
//                           // 1/sqrt(YMM1[63:32])   -> YMM8[63:32]
//                           // 1/sqrt(YMM1[95:64])   -> YMM8[95:64]
//                           // 1/sqrt(YMM1[127:96])  -> YMM8[127:96]
//                           // 1/sqrt(YMM1[159:128]) -> YMM8[159:128]
//                           // 1/sqrt(YMM1[191:160]) -> YMM8[191:160]
//                           // 1/sqrt(YMM1[223:192]) -> YMM8[223:192]
//                           // 1/sqrt(YMM1[255:224]) -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrsqrtsscomma ( RSQRTSS, )
//
// C prototype:
//  void dg_forthrsqrtsscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 RSQRTSS instruction. This sequence calculates 1/sqrt(x) for the single
//   precision floating point value in the lower 32 bits of the source and puts the
//   result into the lower 32 bits of the destination. Even if there is an exact
//   value for 1/sqrt(x), the answer will be slightly off.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  RSQRTSS,  // 1/sqrt([RBX][31:0])    -> XMM1[31:0]
//
//  XMM2  XMM1  RSQRTSS,     // 1/sqrt(XMM2[31:0])    -> XMM1[31:0]
//
//  XMM1 <-  XMM2  RSQRTSS, // 1/sqrt(XMM2[31:0])    -> XMM1[31:0]
//
//  XMM1  XMM8  RSQRTSS,     // 1/sqrt(XMM1[31:0])    -> XMM8[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvrsqrtsscomma ( VRSQRTSS, )
//
// C prototype:
//  void dg_forthvrsqrtsscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VRSQRTSS instruction. This sequence calculates 1/sqrt(x) for the single
//   precision floating point value in the lower 32 bits of the source and puts the
//   result into the lower 32 bits of the destination. Even if there is an exact
//   value for 1/sqrt(x), the answer will be slightly off.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  VRSQRTSS,    // 1/sqrt([RBX][31:0])    -> XMM1[31:0]
//
//  XMM2  XMM1  VRSQRTSS,       // 1/sqrt(XMM2[31:0])    -> XMM1[31:0]
//
//  XMM1 <-  XMM2  VRSQRTSS, // 1/sqrt(XMM2[31:0])    -> XMM1[31:0]
//
//  YMM1  YMM8  VRSQRTSS,       // 1/sqrt(YMM1[31:0])    -> YMM8[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or ymm register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthsqrtpdcomma ( SQRTPD, )
//
// C prototype:
//  void dg_forthsqrtpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 SQRTPD instruction. This opcode sequence gets the square root of
//   each double precision floating point value in the source and puts the
//   results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  SQRTPD,      // sqrt([RBX][63:0])   -> XMM0[63:0]
//                              // sqrt([RBX][127:64]) -> XMM0[127:64]
//  XMM2  XMM0  SQRTPD,         // sqrt(XMM2[63:0])   -> XMM0[63:0]
//                              // sqrt(XMM2[127:64]) -> XMM0[127:64]
//  XMM2 <- XMM0  SQRTPD,    // sqrt(XMM0[63:0])   -> XMM2[63:0]
//                              // sqrt(XMM0[127:64]) -> XMM2[127:64]
//  XMM0 XMM8 SQRTPD,           // sqrt(XMM0[63:0])   -> XMM8[63:0]
//                              // sqrt(XMM0[127:64]) -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvsqrtpdcomma ( VSQRTPD, )
//
// C prototype:
//  void dg_forthvsqrtpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VSQRTPD instruction. This opcode sequence gets the square root of
//   each double precision floating point value in the source and puts the
//   results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VSQRTPD,     // sqrt([RBX][63:0])   -> XMM0[63:0]
//                              // sqrt([RBX][127:64]) -> XMM0[127:64]
//
//  XMM2  XMM0  VSQRTPD,        // sqrt(XMM2[63:0])    -> XMM0[63:0]
//                              // sqrt(XMM2[127:64])  -> XMM0[127:64]
//
//  XMM2 <- XMM0  VSQRTPD,   // sqrt(XMM0[63:0])    -> XMM2[63:0]
//                              // sqrt(XMM0[127:64])  -> XMM2[127:64]
//
//  YMM0 YMM8 VSQRTPD,          // sqrt(YMM0[63:0])    -> YMM8[63:0]
//                              // sqrt(YMM0[127:64])  -> YMM8[127:64]
//                              // sqrt(YMM0[191:128]) -> YMM8[191:128]
//                              // sqrt(YMM0[255:192]) -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//  or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthsqrtpscomma ( SQRTPS, )
//
// C prototype:
//  void dg_forthsqrtpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 SQRTPS instruction. This opcode sequence gets the square root of
//   each single precision floating point value in the source and puts the
//   result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  SQRTPS,      // sqrt([RBX][31:0])   -> XMM0[31:0]
//                              // sqrt([RBX][63:32])  -> XMM0[63:32]
//                              // sqrt([RBX][95:64])  -> XMM0[95:64]
//                              // sqrt([RBX][127:96]) -> XMM0[127:96]
//
//  XMM2  XMM0  SQRTPS,         // sqrt(XMM2[31:0])   -> XMM0[31:0]
//                              // sqrt(XMM2[63:32])  -> XMM0[63:32]
//                              // sqrt(XMM2[95:64])  -> XMM0[95:64]
//                              // sqrt(XMM2[127:96]) -> XMM0[127:96]
//
//  XMM2 <- XMM0  SQRTPS,       // sqrt(XMM0[31:0])   -> XMM2[31:0]
//                              // sqrt(XMM0[63:32])  -> XMM2[63:32]
//                              // sqrt(XMM0[95:64])  -> XMM2[95:64]
//                              // sqrt(XMM0[127:96]) -> XMM2[127:96]
//
//  XMM0 XMM8 SQRTPS,           // sqrt(XMM0[31:0])   -> XMM8[31:0]
//                              // sqrt(XMM0[63:32])  -> XMM8[63:32]
//                              // sqrt(XMM0[95:64])  -> XMM8[95:64]
//                              // sqrt(XMM0[127:96]) -> XMM8[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvsqrtpscomma ( VSQRTPS, )
//
// C prototype:
//  void dg_forthvsqrtpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VSQRTPS instruction. This opcode sequence gets the square root of
//   each single precision floating point value in the source and puts the
//   result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VSQRTPS,     // sqrt([RBX][31:0])   -> XMM0[31:0]
//                              // sqrt([RBX][63:32])  -> XMM0[63:32]
//                              // sqrt([RBX][95:64])  -> XMM0[95:64]
//                              // sqrt([RBX][127:96]) -> XMM0[127:96]
//
//  XMM2  XMM0  VSQRTPS,        // sqrt(XMM2[31:0])   -> XMM0[31:0]
//                              // sqrt(XMM2[63:32])  -> XMM0[63:32]
//                              // sqrt(XMM2[95:64])  -> XMM0[95:64]
//                              // sqrt(XMM2[127:96]) -> XMM0[127:96]
//
//  XMM2 <- XMM0  VSQRTPS,      // sqrt(XMM0[31:0])   -> XMM2[31:0]
//                              // sqrt(XMM0[63:32])  -> XMM2[63:32]
//                              // sqrt(XMM0[95:64])  -> XMM2[95:64]
//                              // sqrt(XMM0[127:96]) -> XMM2[127:96]
//
//  YMM0 YMM8 VSQRTPS,          // sqrt(YMM0[31:0])    -> YMM8[31:0]
//                              // sqrt(YMM0[63:32])   -> YMM8[63:32]
//                              // sqrt(YMM0[95:64])   -> YMM8[95:64]
//                              // sqrt(YMM0[127:96])  -> YMM8[127:96]
//                              // sqrt(YMM0[159:128]) -> YMM8[159:128]
//                              // sqrt(YMM0[191:160]) -> YMM8[191:160]
//                              // sqrt(YMM0[223:192]) -> YMM8[223:192]
//                              // sqrt(YMM0[255:224]) -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthsqrtsdcomma ( SQRTSD, )
//
// C prototype:
//  void dg_forthsqrtsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 SQRTSD instruction. This opcode sequence gets the square root of
//   the double precision floating point value in the lower 64 bits of the
//   source and puts the result into the lower 64 bits of the destination.
//   The upper 64 bits are unmodified.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  SQRTSD,      // sqrt([RBX][63:0]) -> XMM0[63:0]
//
//  XMM2  XMM0  SQRTSD,         // sqrt(XMM2[63:0]) -> XMM0[63:0]
//
//  XMM2 <- XMM0  SQRTSD,    // sqrt(XMM0[63:0]) -> XMM2[63:0]
//
//  XMM0 XMM8 SQRTSD,           // sqrt(XMM0[63:0]) -> XMM8[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvsqrtsdcomma ( VSQRTSD, )
//
// C prototype:
//  void dg_forthvsqrtsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for target y can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for target x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VSQRTSD instruction. This opcode sequence gets the square root of
//   the double precision floating point value in the lower 64 bits of the
//   source and puts the result into the lower 64 bits of the destination.
//   The upper 64 bits of target y are copied to the upper 64 bits of the
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VSQRTSD,      // sqrt([RBX][63:0]) -> XMM0[63:0]
//                                     // XMM1[127:64]      -> XMM0[127:64]
//                                     // 0                 -> YMM0[255:128]
//
//  XMM2  XMM1  XMM0  VSQRTSD,         // sqrt(XMM2[63:0])  -> XMM0[63:0]
//                                     // XMM1[127:64]      -> XMM0[127:64]
//                                     // 0                 -> YMM0[255:128]
//
//  XMM2 <- XMM1  XMM0  VSQRTSD,       // sqrt(XMM0[63:0])  -> XMM2[63:0]
//                                     // XMM1[127:64]      -> XMM2[127:64]
//                                     // 0                 -> YMM2[255:128]
//
//  XMM0 XMM1  XMM8 VSQRTSD,           // sqrt(XMM0[63:0])  -> XMM8[63:0]
//                                     // XMM1[127:64]      -> XMM8[127:64]
//                                     // 0                 -> YMM8[255:128]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthsqrtsscomma ( SQRTSS, )
//
// C prototype:
//  void dg_forthsqrtsscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 SQRTSS instruction. This opcode sequence gets the square root of the
//   single precision floating point value in the lower 32 bits of the source
//   and puts the result into the lower 32 bits of the destination.
//   The upper 96 bits of the destination are not changed.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  SQRTSS,      // sqrt([RBX][31:0]) -> XMM0[31:0]
//
//  XMM2  XMM0  SQRTSS,         // sqrt(XMM2[31:0]) -> XMM0[31:0]
//
//  XMM2 <- XMM0  SQRTSS,    // sqrt(XMM0[31:0]) -> XMM2[31:0]
//
//  XMM0 XMM8 SQRTSS,           // sqrt(XMM0[31:0]) -> XMM8[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvsqrtsscomma ( VSQRTSS, )
//
// C prototype:
//  void dg_forthvsqrtsscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for target y can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for target x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VSQRTSS instruction. This opcode sequence gets the square root of the
//   single precision floating point value in the lower 32 bits of the source
//   and puts the result into the lower 32 bits of the destination.
//   The upper 96 bits of target y are copied to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VSQRTSS,      // sqrt([RBX][31:0]) -> XMM0[31:0]
//                                     // XMM1[127:32]      -> XMM0[127:32]
//                                     // 0                 -> YMM0[255:128]
//
//  XMM2  XMM1  XMM0  VSQRTSS,         // sqrt(XMM2[31:0]) -> XMM0[31:0]
//                                     // XMM1[127:32]     -> XMM0[127:32]
//                                     // 0                -> YMM0[255:128]
//
//  XMM2 <- XMM1  XMM0  VSQRTSS,    // sqrt(XMM0[31:0]) -> XMM2[31:0]
//                                     // XMM1[127:32]     -> XMM2[127:32]
//                                     // 0                -> YMM2[255:128]
//
//  XMM0  XMM1  XMM8 VSQRTSS,          // sqrt(XMM0[31:0]) -> XMM8[31:0]
//                                     // XMM1[127:32]     -> XMM8[127:32]
//                                     // 0                -> YMM8[255:128]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthsubpdcomma ( SUBPD, )
//
// C prototype:
//  void dg_forthsubpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 SUBPD instruction. This opcode sequence subtracts each double
//   precision floating point value in the source from each double precision
//   floating point value in the destination and puts the results into the
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  SUBPD,      // XMM0[63:0]   - [RBX][63:0]   -> XMM0[63:0]
//                             // XMM0[127:64] - [RBX][127:64] -> XMM0[127:64]
//
//  XMM2  XMM0  SUBPD,         // XMM0[63:0]   - XMM2[63:0]    -> XMM0[63:0]
//                             // XMM0[127:64] - XMM2[127:64]  -> XMM0[127:64]
//
//  XMM2 <- XMM0  SUBPD,    // XMM2[63:0]   - XMM0[63:0]    -> XMM2[63:0]
//                             // XMM2[127:64] - XMM0[127:64]  -> XMM2[127:64]
//
//  XMM0 XMM8 SUBPD,           // XMM8[63:0]   - XMM0[63:0]    -> XMM8[63:0]
//                             // XMM8[127:64] - XMM0[127:64]  -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvsubpdcomma ( VSUBPD, )
//
// C prototype:
//  void dg_forthvsubpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VSUBPD instruction. This opcode sequence subtracts each double
//   precision floating point value in the source from each double precision
//   floating point value in target y and puts the results into the
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VSUBPD,      
//                       // XMM1[63:0]   - [RBX][63:0]   -> XMM0[63:0]
//                       // XMM1[127:64] - [RBX][127:64] -> XMM0[127:64]
//
//  XMM2  XMM1  XMM0  VSUBPD,  
//                       // XMM1[63:0]   - XMM2[63:0]    -> XMM0[63:0]
//                       // XMM1[127:64] - XMM2[127:64]  -> XMM0[127:64]
//
//  XMM2 <- XMM1  XMM0  VSUBPD,    
//                       // XMM1[63:0]   - XMM0[63:0]    -> XMM2[63:0]
//                       // XMM1[127:64] - XMM0[127:64]  -> XMM2[127:64]
//
//  YMM0  YMM1  YMM8 VSUBPD,           
//                       // YMM1[63:0]    - YMM0[63:0]    -> YMM8[63:0]
//                       // YMM1[127:64]  - YMM0[127:64]  -> YMM8[127:64]
//                       // YMM1[191:128] - YMM0[191:128] -> YMM8[191:128]
//                       // YMM1[255:192] - YMM0[255:192] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthsubpscomma ( SUBPS, )
//
// C prototype:
//  void dg_forthsubpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 SUBPS instruction. This opcode sequence subtracts each single
//   precision floating point value in the source from each single precision
//   floating point value in the destination and puts the results into the
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  SUBPS,    // XMM0[31:0]   - [RBX][31:0]   -> XMM0[31:0]
//                           // XMM0[63:32]  - [RBX][63:32]  -> XMM0[63:32]
//                           // XMM0[95:64]  - [RBX][95:64]  -> XMM0[95:64]
//                           // XMM0[127:96] - [RBX][127:96] -> XMM0[127:96]
//
//  XMM2  XMM0  SUBPS,       // XMM0[31:0]   - XMM2[31:0]   -> XMM0[31:0]
//                           // XMM0[63:32]  - XMM2[63:32]  -> XMM0[63:32]
//                           // XMM0[95:64]  - XMM2[95:64]  -> XMM0[95:64]
//                           // XMM0[127:96] - XMM2[127:96] -> XMM0[127:96]
//
//  XMM2 <- XMM0  SUBPS,  // XMM2[31:0]   - XMM0[31:0]   -> XMM2[31:0]
//                           // XMM2[63:32]  - XMM0[63:32]  -> XMM2[63:32]
//                           // XMM2[95:64]  - XMM0[95:64]  -> XMM2[95:64]
//                           // XMM2[127:96] - XMM0[127:96] -> XMM2[127:96]
//
//  XMM0  XMM8  SUBPS,       // XMM8[31:0]   - XMM0[31:0]   -> XMM8[31:0]
//                           // XMM8[63:32]  - XMM0[63:32]  -> XMM8[63:32]
//                           // XMM8[95:64]  - XMM0[95:64]  -> XMM8[95:64]
//                           // XMM8[127:96] - XMM0[127:96] -> XMM8[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvsubpscomma ( VSUBPS, )
//
// C prototype:
//  void dg_forthvsubpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VSUBPS instruction. This opcode sequence subtracts each single
//   precision floating point value in the source from each single precision
//   floating point value in target y and puts the results into the
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VSUBPS,   
//                           // XMM1[31:0]   - [RBX][31:0]   -> XMM0[31:0]
//                           // XMM1[63:32]  - [RBX][63:32]  -> XMM0[63:32]
//                           // XMM1[95:64]  - [RBX][95:64]  -> XMM0[95:64]
//                           // XMM1[127:96] - [RBX][127:96] -> XMM0[127:96]
//
//  XMM2  XMM1  XMM0  VSUBPS,      
//                           // XMM1[31:0]   - XMM2[31:0]   -> XMM0[31:0]
//                           // XMM1[63:32]  - XMM2[63:32]  -> XMM0[63:32]
//                           // XMM1[95:64]  - XMM2[95:64]  -> XMM0[95:64]
//                           // XMM1[127:96] - XMM2[127:96] -> XMM0[127:96]
//
//  XMM2 <- XMM1  XMM0  VSUBPS, 
//                           // XMM1[31:0]   - XMM0[31:0]   -> XMM2[31:0]
//                           // XMM1[63:32]  - XMM0[63:32]  -> XMM2[63:32]
//                           // XMM1[95:64]  - XMM0[95:64]  -> XMM2[95:64]
//                           // XMM1[127:96] - XMM0[127:96] -> XMM2[127:96]
//
//  YMM0  YMM1  YMM8  VSUBPS,      
//                           // YMM1[31:0]    - YMM0[31:0]    -> YMM8[31:0]
//                           // YMM1[63:32]   - YMM0[63:32]   -> YMM8[63:32]
//                           // YMM1[95:64]   - YMM0[95:64]   -> YMM8[95:64]
//                           // YMM1[127:96]  - YMM0[127:96]  -> YMM8[127:96]
//                           // YMM1[159:128] - YMM0[159:128] -> YMM8[159:128]
//                           // YMM1[191:160] - YMM0[191:160] -> YMM8[191:160]
//                           // YMM1[223:192] - YMM0[223:192] -> YMM8[223:192]
//                           // YMM1[255:224] - YMM0[255:224] -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthsubsdcomma ( SUBSD, )
//
// C prototype:
//  void dg_forthsubsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 SUBSD instruction. This opcode sequence subtracts the double
//   precision floating point value in the lower 64 bits of the source from the
//   double precision floating point value in the lower 64 bits of the destination
//   and puts the result into the lower 64 bits of the destination.
//   The upper 64 bits are unmodified.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  SUBSD,   // XMM0[63:0] - [RBX][63:0] -> XMM0[63:0]
//
//  XMM2  XMM0  SUBSD,      // XMM0[63:0] - XMM2[63:0] -> XMM0[63:0]
//
//  XMM2 <- XMM0  SUBSD, // XMM2[63:0] - XMM0[63:0] -> XMM2[63:0]
//
//  XMM0  XMM8  SUBSD,      // XMM8[63:0] - XMM0[63:0] -> XMM8[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvsubsdcomma ( VSUBSD, )
//
// C prototype:
//  void dg_forthvsubsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for target y can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targets x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VSUBSD instruction. This opcode sequence subtracts the double
//   precision floating point value in the lower 64 bits of the source from the
//   double precision floating point value in the lower 64 bits of target y
//   and puts the result into the lower 64 bits of the destination.
//   The upper 64 bits are copied from target y.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VSUBSD,   // XMM1[63:0] - [RBX][63:0] -> XMM0[63:0]
//                                 // XMM1[127:64]             -> XMM0[127:64]
//                                 // 0                        -> YMM0[255:128]
//
//  XMM2  XMM1  XMM0  VSUBSD,      // XMM1[63:0] - XMM2[63:0] -> XMM0[63:0]
//                                 // XMM1[127:64]            -> XMM0[127:64]
//                                 // 0                       -> YMM0[255:128]
//
//  XMM2 <- XMM1  XMM0  VSUBSD, // XMM1[63:0] - XMM0[63:0] -> XMM2[63:0]
//                                 // XMM1[127:64]            -> XMM2[127:64]
//                                 // 0                       -> YMM2[255:128]
//
//  XMM0  XMM1  XMM8  VSUBSD,      // XMM1[63:0] - XMM0[63:0] -> XMM8[63:0]
//                                 // XMM1[127:64]            -> XMM8[127:64]
//                                 // 0                       -> YMM8[255:128]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthsubsscomma ( SUBSS, )
//
// C prototype:
//  void dg_forthsubsscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 SUBSS instruction. This opcode sequence subtracts the single
//   precision floating point value in the lower 32 bits of the source
//   from the lower 32 bits of the destination and puts the result into the
//   lower 32 bits of the destination.
//   The upper 96 bits of the destination are not changed.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  SUBSS,   // XMM0[31:0] - [RBX][31:0] -> XMM0[31:0]
//
//  XMM2  XMM0  SUBSS,      // XMM0[31:0] - XMM2[31:0] -> XMM0[31:0]
//
//  XMM2 <- XMM0  SUBSS, // XMM2[31:0] - XMM0[31:0] -> XMM2[31:0]
//
//  XMM0 XMM8 SUBSS,        // XMM8[31:0] - XMM0[31:0] -> XMM8[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvsubsscomma ( VSUBSS, )
//
// C prototype:
//  void dg_forthvsubsscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for target y can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targets x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VSUBSS instruction. This opcode sequence subtracts the single
//   precision floating point value in the lower 32 bits of the source
//   from the single precision floating point value in the lower 32 bits of 
//   target y and puts the result into the lower 32 bits of the destination.
//   The upper 96 bits of the destination are copied from target y.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VSUBSS,   // XMM1[31:0] - [RBX][31:0] -> XMM0[31:0]
//                                 // XMM1[127:32]             -> XMM0[127:32]
//                                 // 0                        -> YMM0[255:128]
//
//  XMM2  XMM1  XMM0  VSUBSS,      // XMM1[31:0] - XMM2[31:0]  -> XMM0[31:0]
//                                 // XMM1[127:32]             -> XMM0[127:32]
//                                 // 0                        -> YMM0[255:128]
//
//  XMM2 <-  XMM1  XMM0  VSUBSS, // XMM1[31:0] - XMM0[31:0]  -> XMM2[31:0]
//                                  // XMM1[127:32]             -> XMM2[127:32]
//                                  // 0                        -> YMM2[255:128]
//
//  XMM0  XMM1  XMM8 VSUBSS,       // XMM1[31:0] - XMM0[31:0]  -> XMM8[31:0]
//                                 // XMM1[127:32]             -> XMM8[127:32]
//                                 // 0                        -> YMM8[255:128]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthucomisdcomma ( UCOMISD, )
//
// C prototype:
//  void dg_forthucomisdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 UCOMISD instruction. This opcode sequence subtracts the double
//   precision floating point value in the lower 64 bits of the source from the
//   double precision floating point value in the lower 64 bits of the destination
//   and sets the flags but does not store the result to the destination.
//   If the destination value is greater or equal to the source value, then the
//     carry flag is cleared, otherwise it is set.
//   If the destination value is equal to the source value, then the zero flag is
//    set, otherwise it is cleared.
//   If either the source or destination value is QNaN or SNaN, then the parity
//    flag is set, otherwise it is cleared. In this unordered case the zero and
//    carry flags are also set.
//   A floating point exception is generated only if there is an SNaN, not if
//    there is a QNaN. This is how this instruction is different from COMISD.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  UCOMISD,   // XMM0[63:0] - [RBX][63:0]
//
//  XMM2  XMM0  UCOMISD,      // XMM0[63:0] - XMM2[63:0]
//
//  XMM2 <- XMM0  UCOMISD, // XMM2[63:0] - XMM0[63:0]
//
//  XMM0 XMM8 UCOMISD,        // XMM8[63:0] - XMM0[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvucomisdcomma ( VUCOMISD, )
//
// C prototype:
//  void dg_forthvucomisdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VUCOMISD instruction. This opcode sequence subtracts the double
//   precision floating point value in the lower 64 bits of the source from the
//   double precision floating point value in the lower 64 bits of the destination
//   and sets the flags but does not store the result to the destination.
//   If the destination value is greater or equal to the source value, then the
//     carry flag is cleared, otherwise it is set.
//   If the destination value is equal to the source value, then the zero flag is
//    set, otherwise it is cleared.
//   If either the source or destination value is QNaN or SNaN, then the parity
//    flag is set, otherwise it is cleared. In this unordered case the zero and
//    carry flags are also set.
//   A floating point exception is generated only if there is an SNaN, not if
//    there is a QNaN. This is how this instruction is different from VCOMISD.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VUCOMISD,   // XMM0[63:0] - [RBX][63:0]
//
//  XMM2  XMM0  VUCOMISD,      // XMM0[63:0] - XMM2[63:0]
//
//  XMM2 <- XMM0  VUCOMISD, // XMM2[63:0] - XMM0[63:0]
//
//  XMM0  XMM8 VUCOMISD,       // XMM8[63:0] - XMM0[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthucomisscomma ( UCOMISS, )
//
// C prototype:
//  void dg_forthucomisscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 UCOMISS instruction. This opcode sequence subtracts the single
//   precision floating point value in the lower 32 bits of the source
//   from the single precision floating point value in the lower 32 bits of
//   the destination and sets the flags. The result is not put into the
//   destination.
//   If the destination value is greater or equal to the source value, then the
//     carry flag is cleared, otherwise it is set.
//   If the destination value is equal to the source value, then the zero flag is
//    set, otherwise it is cleared.
//   If either the source or destination value is QNaN or SNaN, then the parity
//    flag is set, otherwise it is cleared. In this unordered case the zero and
//    carry flags are also set.
//   A floating point exception is generated only if there is an SNaN, not if
//    there is a QNaN. This is how this instruction is different from COMISS.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  UCOMISS,   // XMM0[31:0] - [RBX][31:0]
//
//  XMM2  XMM0  UCOMISS,      // XMM0[31:0] - XMM2[31:0]
//
//  XMM2 <- XMM0  UCOMISS, // XMM2[31:0] - XMM0[31:0]
//
//  XMM0 XMM8 UCOMISS,        // XMM8[31:0] - XMM0[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvucomisscomma ( VUCOMISS, )
//
// C prototype:
//  void dg_forthvucomisscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VUCOMISS instruction. This opcode sequence subtracts the single
//   precision floating point value in the lower 32 bits of the source
//   from the single precision floating point value in the lower 32 bits of
//   the destination and sets the flags. The result is not put into the
//   destination.
//   If the destination value is greater or equal to the source value, then the
//     carry flag is cleared, otherwise it is set.
//   If the destination value is equal to the source value, then the zero flag is
//    set, otherwise it is cleared.
//   If either the source or destination value is QNaN or SNaN, then the parity
//    flag is set, otherwise it is cleared. In this unordered case the zero and
//    carry flags are also set.
//   A floating point exception is generated only if there is an SNaN, not if
//    there is a QNaN. This is how this instruction is different from VCOMISS.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VUCOMISS,   // XMM0[31:0] - [RBX][31:0]
//
//  XMM2  XMM0  VUCOMISS,      // XMM0[31:0] - XMM2[31:0]
//
//  XMM2 <- XMM0  VUCOMISS, // XMM2[31:0] - XMM0[31:0]
//
//  XMM0  XMM8  VUCOMISS,      // XMM8[31:0] - XMM0[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthunpckhpdcomma ( UNPCKHPD, )
//
// C prototype:
//  void dg_forthunpckhpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 UNPCKHPD instruction. This opcode sequence copies the high 64 bits
//   in the destination to the low 64 bits in the destination, also copies
//   the high 64 bits in the source to the high 64 bits in the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  UNPCKHPD,   // XMM0[127:64]   -> XMM0[63:0]
//                             // [RBX][127:64]  -> XMM0[127:64]
//
//  XMM2  XMM0  UNPCKHPD,      // XMM0[127:64]   -> XMM0[63:0]
//                             // XMM2[127:64]  -> XMM0[127:64]
//
//  XMM2 <- XMM0  UNPCKHPD, // XMM2[127:64]   -> XMM2[63:0]
//                             // XMM0[127:64]  -> XMM2[127:64]
//
//  XMM0 XMM8 UNPCKHPD,        // XMM8[127:64]   -> XMM8[63:0]
//                             // XMM0[127:64]  -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvunpckhpdcomma ( VUNPCKHPD, )
//
// C prototype:
//  void dg_forthvunpckhpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VUNPCKHPD instruction. This opcode sequence copies the high 64 bits
//   of each 128 bit section in target y to the low 64 bits of each 128 bit section
//   in the destination, and also copies the high 64 bits of each 128 bit section in
//   the source to the high 64 bits of each 128 bit section in the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VUNPCKHPD,  // XMM1[127:64]   -> XMM0[63:0]
//                                   // [RBX][127:64]  -> XMM0[127:64]
//
//  XMM2  XMM1  XMM0  VUNPCKHPD,     // XMM1[127:64]   -> XMM0[63:0]
//                                   // XMM2[127:64]   -> XMM0[127:64]
//
//  XMM2 <- XMM1  XMM0  VUNPCKHPD, // XMM1[127:64]  -> XMM2[63:0]
//                                    // XMM0[127:64]  -> XMM2[127:64]
//
//  YMM0  YMM1  YMM8  VUNPCKHPD,      // YMM1[127:64]  -> YMM8[63:0]
//                                    // YMM0[127:64]  -> YMM8[127:64]
//                                    // YMM1[255:192] -> YMM8[191:128]
//                                    // YMM0[255:192] -> YMM8[255:192]
//
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y
//   must each be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthunpckhpscomma ( UNPCKHPS, )
//
// C prototype:
//  void dg_forthunpckhpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 UNPCKHPS instruction. This opcode sequence interleaves the two 32
//   bit values in the upper 64 bits of the destination with the two 32 bit
//   values in the upper 64 bits of the source and puts the results into the
//   destination. The values alternate between the ones from the destination
//   and the source, with the lowest one from the destination going to the
//   lowest position in the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  UNPCKHPS, // XMM0[95:64]   -> XMM0[31:0]
//                           // [RBX][95:64]  -> XMM0[63:32]
//                           // XMM0[127:96]  -> XMM0[95:64]
//                           // [RBX][127:96] -> XMM0[127:96]
//
//  XMM2  XMM0  UNPCKHPS,    // XMM0[95:64]   -> XMM0[31:0]
//                           // XMM2[95:64]   -> XMM0[63:32]
//                           // XMM0[127:96]  -> XMM0[95:64]
//                           // XMM2[127:96]  -> XMM0[127:96]
//
//  XMM2 <- XMM0  UNPCKHPS, // XMM2[95:64]   -> XMM2[31:0]
//                             // XMM0[95:64]   -> XMM2[63:32]
//                             // XMM2[127:96]  -> XMM2[95:64]
//                             // XMM0[127:96]  -> XMM2[127:96]
//
//  XMM0 XMM8 UNPCKHPS,        // XMM8[95:64]   -> XMM8[31:0]
//                             // XMM0[95:64]   -> XMM8[63:32]
//                             // XMM8[127:96]  -> XMM8[95:64]
//                             // XMM0[127:96]  -> XMM8[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvunpckhpscomma ( VUNPCKHPS, )
//
// C prototype:
//  void dg_forthvunpckhpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VUNPCKHPS instruction. This opcode sequence copies the two 32 bit
//   values in the upper 64 bits of each 128 bit section of the source and the 
//   two 32 bit values in the upper 64 bits of each 128 bit section of target y
//   and puts the results into the destination. The values are alternated 
//   between the ones from the source and target y with the lowest one from 
//   target y going to the lowest position in the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VUNPCKHPS, // XMM1[95:64]   -> XMM0[31:0]
//                                  // [RBX][95:64]  -> XMM0[63:32]
//                                  // XMM1[127:96]  -> XMM0[95:64]
//                                  // [RBX][127:96] -> XMM0[127:96]
//
//  XMM2  XMM1  XMM0  VUNPCKHPS,    // XMM1[95:64]   -> XMM0[31:0]
//                                  // XMM2[95:64]   -> XMM0[63:32]
//                                  // XMM1[127:96]  -> XMM0[95:64]
//                                  // XMM2[127:96]  -> XMM0[127:96]
//
//  XMM2 <- XMM1  XMM0  VUNPCKHPS, // XMM1[95:64]   -> XMM2[31:0]
//                                    // XMM0[95:64]   -> XMM2[63:32]
//                                    // XMM1[127:96]  -> XMM2[95:64]
//                                    // XMM0[127:96]  -> XMM2[127:96]
//
//  YMM0  YMM1  YMM8  VUNPCKHPS,      // YMM1[95:64]   -> YMM8[31:0]
//                                    // YMM0[95:64]   -> YMM8[63:32]
//                                    // YMM1[127:96]  -> YMM8[95:64]
//                                    // YMM0[127:96]  -> YMM8[127:96]
//                                    // YMM1[223:192] -> YMM8[159:128]
//                                    // YMM0[223:192] -> YMM8[191:160]
//                                    // YMM1[255:224] -> YMM8[223:192]
//                                    // YMM0[255:224] -> YMM8[255:224]
//
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y
//   must each be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthunpcklpdcomma ( UNPCKLPD, )
//
// C prototype:
//  void dg_forthunpcklpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 UNPCKLPD instruction. This opcode sequence copies
//   the low 64 bits in the source to the high 64 bits in the destination.
//   The low 64 bits in the destination are not changed.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  UNPCKLPD,   // [RBX][63:0] -> XMM0[127:64]
//
//  XMM2  XMM0  UNPCKLPD,      // XMM2[63:0]  -> XMM0[127:64]
//
//  XMM2 <- XMM0  UNPCKLPD, // XMM0[63:0]  -> XMM2[127:64]
//
//  XMM0 XMM8 UNPCKLPD,        // XMM0[63:0]  -> XMM8[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvunpcklpdcomma ( VUNPCKLPD, )
//
// C prototype:
//  void dg_forthvunpcklpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VUNPCKLPD instruction. This opcode sequence copies the low 64 bits
//   of each 128 bit section of the source to the high 64 bits of each 128 bit
//   section of the destination.
//   The low 64 bits of each 128 bit section of target y are copied to the
//   low 64 bits of each 128 bit section of the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VUNPCKLPD,   // [RBX][63:0] -> XMM0[127:64]
//                                    // XMM1[63:0]  -> XMM0[63:0]
//
//  XMM2  XMM1  XMM0  VUNPCKLPD,      // XMM2[63:0]  -> XMM0[127:64]
//                                    // XMM1[63:0]  -> XMM0[63:0]
//
//  XMM2 <- XMM1  XMM0  VUNPCKLPD, // XMM0[63:0]  -> XMM2[127:64]
//                                    // XMM1[63:0]  -> XMM2[63:0]
//
//  YMM0  YMM1  YMM8 VUNPCKLPD,       // YMM0[63:0]    -> YMM8[127:64]
//                                    // YMM1[63:0]    -> YMM8[63:0]
//                                    // YMM0[191:128] -> YMM8[255:192]
//                                    // YMM1[191:128] -> YMM8[191:128]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthunpcklpscomma ( UNPCKLPS, )
//
// C prototype:
//  void dg_forthunpcklpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 UNPCKLPS instruction. This opcode sequence copies the two 32 bit
//   values in the lower 64 bits of the source and the two 32 bit values
//   in the lower 64 bits in the destination and puts the results into the
//   destination. The values are alternated between the ones from the source
//   and destination with the lowest one from the destination staying in the
//   lowest position in the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  UNPCKLPS, // XMM0[31:0]   -> XMM0[31:0]
//                           // [RBX][31:0]  -> XMM0[63:32]
//                           // XMM0[63:32]  -> XMM0[95:64]
//                           // [RBX][63:32] -> XMM0[127:96]
//
//  XMM2  XMM0  UNPCKLPS,    // XMM0[31:0]   -> XMM0[31:0]
//                           // XMM2[31:0]   -> XMM0[63:32]
//                           // XMM0[63:32]  -> XMM0[95:64]
//                           // XMM2[63:32]  -> XMM0[127:96]
//
//  XMM2 <- XMM0  UNPCKLPS, // XMM2[31:0]   -> XMM2[31:0]
//                             // XMM0[31:0]   -> XMM2[63:32]
//                             // XMM2[63:32]  -> XMM2[95:64]
//                             // XMM0[63:32]  -> XMM2[127:96]
//
//  XMM0 XMM8 UNPCKLPS,        // XMM8[31:0]   -> XMM8[31:0]
//                             // XMM0[31:0]   -> XMM8[63:32]
//                             // XMM8[63:32]  -> XMM8[95:64]
//                             // XMM0[63:32]  -> XMM8[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvunpcklpscomma ( VUNPCKLPS, )
//
// C prototype:
//  void dg_forthvunpcklpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VUNPCKLPS instruction. This opcode sequence copies the two 32 bit
//   values in the lower 64 bits of each 128 bit section of the source and the
//   two 32 bit values in the lower 64 bits of each 128 bit section of target y
//   and puts the results into the destination. The values are alternated 
//   between the ones from the source and target y with the lowest one from the
//   target y staying in the lowest position in the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VUNPCKLPS,  // XMM1[31:0]   -> XMM0[31:0]
//                                   // [RBX][31:0]  -> XMM0[63:32]
//                                   // XMM1[63:32]  -> XMM0[95:64]
//                                   // [RBX][63:32] -> XMM0[127:96]
//
//  XMM2  XMM1  XMM0  VUNPCKLPS,    // XMM1[31:0]   -> XMM0[31:0]
//                                  // XMM2[31:0]   -> XMM0[63:32]
//                                  // XMM1[63:32]  -> XMM0[95:64]
//                                  // XMM2[63:32]  -> XMM0[127:96]
//
//  XMM2 <- XMM1  XMM0  VUNPCKLPS, // XMM1[31:0]   -> XMM2[31:0]
//                                    // XMM0[31:0]   -> XMM2[63:32]
//                                    // XMM1[63:32]  -> XMM2[95:64]
//                                    // XMM0[63:32]  -> XMM2[127:96]
//
//  YMM0  YMM1  YMM8 VUNPCKLPS,       // YMM1[31:0]    -> YMM8[31:0]
//                                    // YMM0[31:0]    -> YMM8[63:32]
//                                    // YMM1[63:32]   -> YMM8[95:64]
//                                    // YMM0[63:32]   -> YMM8[127:96]
//                                    // YMM1[159:128] -> YMM8[159:128]
//                                    // YMM0[159:128] -> YMM8[191:160]
//                                    // YMM1[191:160] -> YMM8[223:192]
//                                    // YMM0[191:160] -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthxorpdcomma ( XORPD, )
//
// C prototype:
//  void dg_forthxorpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 XORPD instruction. This sequence does a binary xor of the two targets
//   and puts the result into the destination. The Intel docs say it operates on
//   2 pairs of 64 bit targets, but that's the same thing as two 128 bit targets.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XORPD,    // [RBX][127:0] xor XMM1[127:0] -> XMM1[127:0]
//
//  XMM2  XMM1  XORPD,       // XMM2[127:0] xor XMM1[127:0] -> XMM1[127:0]
//
//  XMM1 <-  XMM2  XORPD, // XMM2[127:0] xor XMM1[127:0] -> XMM1[127:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvxorpdcomma ( VXORPD, )
//
// C prototype:
//  void dg_forthvxorpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VXORPD instruction. This sequence does a binary xor of the source
//   and target y and puts the result into the destination. The Intel docs say
//   it operates on 64 bit floating point values, but it's really just a 128 or
//   256 bit binary xor.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM3  XMM1  VXORPD,    
//                            // [RBX][127:0] xor XMM3[127:0] -> XMM1[127:0]
//
//  XMM2  XMM3  XMM1  VXORPD, // XMM2[127:0] xor XMM3[127:0] -> XMM1[127:0]
//
//  YMM1 <-  YMM3  YMM2  VXORPD, 
//                           // YMM2[255:0] xor YMM3[255:0] -> YMM1[255:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthxorpscomma ( XORPS, )
//
// C prototype:
//  void dg_forthxorpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 XORPS instruction. This sequence does a binary xor of the two targets
//   and puts the result into the destination. The Intel docs say it operates
//   on 4 pairs of 32 bit targets but it's the same thing as two 128 bit targets.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XORPS,    // [RBX][127:0] xor XMM1[127:0] -> XMM1[127:0]
//
//  XMM2  XMM1  XORPS,       // XMM2[127:0] xor XMM1[127:0] -> XMM1[127:0]
//
//  XMM1 <-  XMM2  XORPS, // XMM2[127:0] xor XMM1[127:0] -> XMM1[127:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvxorpscomma ( VXORPS, )
//
// C prototype:
//  void dg_forthvxorpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VXORPS instruction. This sequence does a binary xor of the source
//   target and target y and puts the result into the destination. The Intel
//   docs say it operates on 32 bit floating point values, but it's really just
//   a 128 or 256 bit binary xor.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM3  XMM1  VXORPS,    
//                            // [RBX][127:0] xor XMM3[127:0] -> XMM1[127:0]
//
//  XMM2  XMM3  XMM1  VXORPS, // XMM2[127:0] xor XMM3[127:0] -> XMM1[127:0]
//
//  YMM1 <-  YMM3  YMM2  VXORPS, 
//                           // YMM2[255:0] xor YMM3[255:0] -> YMM1[255:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthsbbcomma ( SBB, )
//
// C prototype:
//  void dg_forthsbbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist          
//  targetyparameterlist          
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   immediatevalue minimumimmediatesize IMMEDIATE
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   8BIT
//   16BIT
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               integer constant that gets sign extended to
//                                 the DATASIZE of the instruction
//                                 64BIT encoding uses multiple instructions
//                                 smaller encodings get sign extended if needed
//   minimumimmediatesize         minimum encoding size in bytes for 
//                                 immediatevalue.
//                                 otherwise, this must fit into a 32 bit integer
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and if N is larger than a signed
//                                 32 bit integer, multiple instructions 
//                                 are compiled. If the destination is memory
//                                 RAX is used.
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate 
//                                 value in bytes, can be either 0, 1, 2, or 4
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target. 
//                                 R is optional.
//                                 Using R also forces some instructions to be 
//                                 encoded using the MODR encoding.
//                                 For INC, and DEC, in 32 bit mode; and also
//                                  PUSH, and POP, using R forces the use
//                                  of MODR encoding instead of opcode+r.
//                                 For ADD, ADC, AND, OR, XOR, SUB, SBB, and
//                                  CMP, add immediate to register a
//                                  instructions, using R forces the use of
//                                  MODR encoding instead of the reg a opcodes.
//                                 The MODR encoding for add 8 bit immediate 
//                                  sign extended to 32 or 64 bits is shorter
//                                  than the reg a opcode encoding. If you
//                                  want this shorter encoding, you have to
//                                  use R.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m is not supported.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1, 2, 4, or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for an
//    x86 SBB instruction. This opcode sequence does:
//      destinationtarget - (sourcetarget + carryflag) and puts the result into the
//      destinationtarget.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX 12348000 N  EAX  SBB,  // subtracts 12348000 plus the carry flag from 
//                             //  EAX to EAX
//  27 N  CL  SBB,             // subtracts 27 plus the carry flag from CL 
//                             //  to CL
//  AX  EBX [R]  SBB,          // subtracts AX plus the carry flag from 
//                             //  the 16 bit memory at the address in EBX to 
//                             //  the 16 bit memory at the address in EBX
//  ECX  EAX  SBB,             // subtracts ECX plus the carry flag from EAX 
//                             //  to EAX
//  -38 N  RAX R  SBB,         // subtracts -38 plus the carry flag from RAX 
//                             //  using the n8 to n64 modr/m sign extended 
//                             //  encoding
//
// Note:
//  Only 1 target can be a memory target.
//  If the address mode is 64 bits, the data size is 64 bits, and N is larger 
//   than what will fit in a signed 32 bit integer, and the destination is memory
//   then  value N  RAX MOV,  RAX mem SBB,  is compiled.
//  If the address mode is 64 bits, the data size is 64 bits, and N is larger 
//   than what will fit in a signed 32 bit integer, and the destination is a
//   register then  register PUSH,  N register MOV,  register RSP [R] SBB,
//   register POP,  is compiled.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthsubcomma ( SUB, )
//
// C prototype:
//  void dg_forthsubcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist          
//  targetyparameterlist          
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   immediatevalue minimumimmediatesize IMMEDIATE
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   8BIT
//   16BIT
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               integer constant that gets sign extended to
//                                 the DATASIZE of the instruction
//                                 64BIT encoding uses multiple instructions
//                                 smaller encodings get sign extended if needed
//   minimumimmediatesize         minimum encoding size in bytes for 
//                                 immediatevalue.
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. If N is larger than a signed
//                                 32 bit integer, multiple instructions 
//                                 are compiled and if the destination is memory
//                                 RAX is used.
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate 
//                                 value in bytes, can be either 0, 1, 2, or 4
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target. 
//                                 R is optional.
//                                 Using R also forces some instructions to be 
//                                 encoded using the MODR encoding.
//                                 For INC, and DEC, in 32 bit mode; and also
//                                  PUSH, and POP, using R forces the use
//                                  of MODR encoding instead of opcode+r.
//                                 For ADD, ADC, AND, OR, XOR, SUB, SBB, and
//                                  CMP, add immediate to register a
//                                  instructions, using R forces the use of
//                                  MODR encoding instead of the reg a opcodes.
//                                 The MODR encoding for add 8 bit immediate 
//                                  sign extended to 32 or 64 bits is shorter
//                                  than the reg a opcode encoding. If you
//                                  want this shorter encoding, you have to
//                                  use R.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m is not supported.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1, 2, 4, or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for an
//    x86 SUB instruction. This opcode sequence does:
//      destinationtarget - sourcetarget and puts the result into the
//      destinationtarget.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX 12348000 N  EAX  SUB,  // subtracts 12348000 from EAX to EAX
//  27 N  CL  SUB,             // subtracts 27 from CL to CL
//  AX  EBX [R]  SUB,          // subtracts AX from the 16 bit memory at
//                             //  the address in EBX to the 16 bit memory at
//                             //  the address in EBX
//  38 N  EDX [R]  32BIT SUB,  // size required, subtracts 38 from the 32 bit
//                             //  memory at the address in EDX to the 32 bit
//                             //  memory at the address in EDX
//  ECX  EAX  SUB,             // subtracts ECX from EAX to EAX
//  ECX <- EAX  SUB,        // subtracts EAX from ECX to ECX
//  -38 N  RAX R  SUB,         // subtracts -38 from RAX using the n8 to n64 
//                             //  modr/m sign extended encoding
//
// Note:
//  Only 1 target can be a memory target.
//  If the address mode is 64 bits, the data size is 64 bits, and N is larger 
//   than what will fit in a signed 32 bit integer, and the destination is memory
//   then  value N  RAX MOV,  RAX mem SUB, is compiled.
//  If the address mode is 64 bits, the data size is 64 bits, and N is larger 
//   than what will fit in a signed 32 bit integer, and the destination is a
//   register then  register PUSH,  N register MOV,  register RSP [R] SUB,
//   register POP,  is compiled.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthxorcomma ( XOR, )
//
// C prototype:
//  void dg_forthxorcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist          
//  targetyparameterlist          
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   immediatevalue minimumimmediatesize IMMEDIATE
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   8BIT
//   16BIT
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               integer constant that gets sign extended to
//                                 the DATASIZE of the instruction
//                                 64BIT encoding uses multiple instructions
//                                 smaller encodings get sign extended if needed
//   minimumimmediatesize         minimum encoding size in bytes for 
//                                 immediatevalue.
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and if N is larger than a signed
//                                 32 bit integer, multiple instructions 
//                                 are compiled and if the destination is memory
//                                 RAX is used.
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate 
//                                 value in bytes, can be either 0, 1, 2, or 4
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target. 
//                                 R is optional.
//                                 Using R also forces some instructions to be 
//                                 encoded using the MODR encoding.
//                                 For INC, and DEC, in 32 bit mode; and also
//                                  PUSH, and POP, using R forces the use
//                                  of MODR encoding instead of opcode+r.
//                                 For ADD, ADC, AND, OR, XOR, SUB, SBB, and
//                                  CMP, add immediate to register a
//                                  instructions, using R forces the use of
//                                  MODR encoding instead of the reg a opcodes.
//                                 The MODR encoding for add 8 bit immediate 
//                                  sign extended to 32 or 64 bits is shorter
//                                  than the reg a opcode encoding. If you
//                                  want this shorter encoding, you have to
//                                  use R.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m is not supported.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1, 2, 4, or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 XOR instruction. This opcode sequence does:
//    destinationtarget exclusiveor sourcetarget and puts the result into the
//    destinationtarget.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX 12348000 N  EAX  XOR,  // xors 12348000 with EAX to EAX
//  27 N  CL  XOR,             // xors 27 with CL to CL
//  AX  EBX [R]  XOR,          // xors AX with the 16 bit memory at the address
//                             //  in EBX to the 16 bit memory at the address
//                             //  in EBX
//  38 N  EDX [R]  32BIT XOR,  // size required, xors 38 with the 32 bit memory
//                             //  at the address in EDX to the 32 bit memory
//                             //  at the address in EDX
//  ECX  EAX  XOR,             // xors ECX with EAX to EAX
//  ECX <- EAX  XOR,        // xors EAX with ECX to ECX
//  -38 N  RAX R  XOR,         // xors -38 with RAX using the n8 to n64 modr/m 
//                             //  sign extended encoding
//
// Note:
//  Only 1 target can be a memory target.
//  If the address mode is 64 bits, the data size is 64 bits, and N is larger 
//   than what will fit in a signed 32 bit integer, and the destination is memory
//   then  value N  RAX MOV,  RAX mem XOR, is compiled.
//  If the address mode is 64 bits, the data size is 64 bits, and N is larger 
//   than what will fit in a signed 32 bit integer, and the destination is a
//   register then  register PUSH,  N register MOV,  register RSP [R] XOR,
//   register POP,  is compiled.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthtestcomma ( TEST, )
//
// C prototype:
//  void dg_forthtestcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure 
//                                         which is used as the bufferhandle for 
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist          
//  targetyparameterlist          
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   8BIT
//   16BIT
//   32BIT
//   64BIT
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   datavalue datasize IMMEDIATE
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               integer constant that gets sign extended to
//                                 the DATASIZE of the instruction
//                                 64BIT encoding uses multiple instructions
//                                 smaller encodings get sign extended if needed
//   minimumimmediatesize         minimum encoding size in bytes for 
//                                 immediatevalue.
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and if N is larger than a signed
//                                 32 bit integer, multiple instructions 
//                                 are compiled and if the destination is memory
//                                 RAX is used.
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate 
//                                 value in bytes, can be either 0, 1, 2, or 4
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A 
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1, 2, 4, or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//                                                              
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 TEST instruction. This opcode sequence evaluates the 
//   destinationtarget binary anded with the sourcetarget and changes the
//   condition code flags based on the result of the and. The result is not
//   stored in the destination.
//    carry is cleared
//    zero flag is set if the result was 0
//    overflow flag is cleared
//    parity flag is set if the low byte of the result has an even number
//     of 1 bits
//    sign flag is set if the high bit of the result was set
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX 12348000 N  EAX  TEST,  // ands 12348000 with EAX but does not store
//                              //  result to EAX, just sets the flags
//  27 N  CL  TEST,             // ands 27 with CL but does not store the
//                              //  result to CL, just sets the flags
//  AX  EBX [R]  TEST,          // ands AX with the 16 bit value in memory at
//                              //  the address in EBX but does not store
//                              //  the result, just sets the flags
//  38 N  EDX [R]  32BIT TEST,  // size required, ands 38 with 32 bit value
//                              //  in memory at the address in EDX but does
//                              //  not store the result, just sets the flags
//  ECX  EAX  TEST,             // ands ECX with EAX but does not store the
//                              //  result to EAX, just sets the flags
//  ECX <- EAX  TEST,        // ands EAX with ECX but does not store the
//                              //  result to ECX, just sets the flags...
//                              //  so actually direction does not matter
//                              //  but it will change how the instruction is
//                              //  encoded
//
// Note:
//  Only 1 target can be a memory target.
//  If the address mode is 64 bits, the data size is 64 bits, and N is larger 
//   than what will fit in a signed 32 bit integer, and the destination is memory
//   then  value N  RAX MOV,  RAX mem TEST, is compiled.
//  If the address mode is 64 bits, the data size is 64 bits, and N is larger 
//   than what will fit in a signed 32 bit integer, and the destination is a
//   register then  register PUSH,  N register MOV,  register RSP [R] TEST,
//   register POP,  is compiled.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthnotcomma ( NOT, )
//
// C prototype:
//  void dg_forthnotcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where 
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist                    
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   8BIT
//   16BIT
//   32BIT
//   64BIT
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A 
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1, 2, 4, or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//
// Data stack out:
//  none
//
//                                                              
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 NOT instruction. This opcode sequence flips all the bits in the
//   target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  EAX  NOT,             // flips all bits in EAX
//  CL  NOT,              // flips all bits in CL
//  EDX [R]  32BIT  NOT,  // flips all bits of the 32 bit memory at the 
//                        //  address specified by EDX 
//                        //  size specifier is required
//  EBX [R]  8BIT  NOT,   // flips all bits of the 8 bit memory at the
//                        //  address specified by EBX,
//                        //  size specifier is required
//  R8 NOT,               // flips all bits in R8
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthnegcomma ( NEG, )
//
// C prototype:
//  void dg_forthnegcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where 
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist                    
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   8BIT
//   16BIT
//   32BIT
//   64BIT
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A 
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1, 2, 4, or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//
// Data stack out:
//  none
//                                                              
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//    an x86 NEG instruction. This opcode sequence does:
//    0 - target and puts the result in the target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  EAX  NEG,             // EAX <- 0 - EAX
//  CL  NEG,              // CL <- 0 - CL
//  EDX [R]  32BIT  NEG,  // [EDX] <- 0 - [EDX] where EDX points to a 32 bit
//                        //  value in memory, size specifier is required
//                        //  
//  EBX [R]  8BIT  NEG,   // [EBX] <- 0 - [EBX] where EBX points to an 8 bit
//                        //  value in memory, size specifier is required
//  R9 NEG,               // R9 <- 0 - R9
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmulcomma ( MUL, )
//
// C prototype:
//  void dg_forthmulcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where 
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist                    
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   8BIT
//   16BIT
//   32BIT
//   64BIT
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A 
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1, 2, 4, or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//
// Data stack out:
//  none
//                                                              
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//    an x86 MUL instruction, which is an unsigned multiply.
//    This opcode sequence does different actions based on the DATASIZE of the
//    instruction:
//     if size is 8BIT then AX <- AL * target
//     if size is 16BIT then DX:AX <- AX * target
//     if size is 32BIT then EDX:EAX <- EAX * target
//     if size is 64BIT then RDX:RAX <- RAX * target
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  BL  MUL,                // multiplies BL with AL and puts result into AX
//  EDX [R]  32BIT  MUL,    // multiplies EAX with the 32 bit value at the
//                          //  memory location specified by EDX and puts the
//                          //  result into EDX:EAX, size specifier is required
//
//  RCX MUL,                // multiplies RAX with the 64 bit value in RCX
//                          //  and puts the result into RDX:RAX
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmulxcomma ( MULX, )
//
// C prototype:
//  void dg_forthmulxcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for targetyparameterlist can
//   contain this addressing mode specifier:
//
//   targetregister
//
//  The parameter list for targetxparameterlist and targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter lists for target x and
//   target z can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  If you want to set the data size for memory targets, you can use these:
//   (This is optional for this instruction. The compiler can figure it out from
//   the size of the destination register.)
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of a memory target in bytes,
//                                 can be 4 or 8
//   DATASIZE                     sets the data size of a memory target
//                                 This is pushed after a memory target
//                                 parameters and can not come in the middle
//                                 of a memory target.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 MULX instruction. This opcode sequence multiplies the value in the
//   source target with the value in the EDX or RDX register depending on the
//   data size of the instruction, then puts the low half of the result into
//   target y, and the high half of the result into the destination.
//   The flags are not changed. If both destinations are the same register,
//   then only the high half of the result is put into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples
//
//   RSI [R]  EAX  ECX  MULX,  // [RSI] * EDX -> ECX:EAX
//
//   RSI [R]  RAX  RCX  MULX,  // [RSI] * RDX -> RCX:RAX
//
//   RSI  RAX  RCX  MULX,      // RSI * RDX -> RCX:RAX
//
//   RCX <-  RAX  RSI  MULX, // RSI * RDX -> RCX:RAX
//
// Note:
//  Putting reverse after any target makes the first target pushed the
//   destination target, and the third target pushed the source target.
//  Only one target can be a memory target. The destination must be a
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpushcomma ( PUSH, )
//
// C prototype:
//  void dg_forthpushcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where 
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist                    
//
//
//
//  The parameter list for the target can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   16BIT
//   32BIT
//   64BIT
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   immediatevalue minimumimmediatesize IMMEDIATE
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               integer constant that gets sign extended to
//                                 the DATASIZE of the instruction
//                                 64BIT encoding uses multiple instructions
//                                 smaller encodings get sign extended
//   minimumimmediatesize         minimum encoding size in bytes for
//                                 immediatevalue. 
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. N can be any 64 bit integer but
//                                 will use multiple instructions if it does
//                                 not fit into a 32 bit sign extended value
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate 
//                                 value in bytes, can be either 0, 1, 2, or 4
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 You can use this to force larger encodings
//                                 for immediatevalue
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A 
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes
//                                 in 32 bit address mode this can be 2 or 4
//                                 in 64 bit address mode this can be 2 or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
//                                                              
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//    an x86 PUSH instruction.
//    This opcode sequence pushes the value of the target onto the return stack.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX 97235029 N  32BIT PUSH,     // pushes 97235029 to the return stack
//                                  //  size specifier required
//                                  //  32 bit address mode only
//  HEX 2761 N  16BIT PUSH,         // pushes 2761 to the return stack
//                                  //  size specifier required
//  EAX  PUSH,                      // pushes EAX to the return stack
//                                  //  32 bit address mode only
//  CX  PUSH,                       // pushes CX to the return stack
//  EDX [R]  32BIT PUSH,            // pushes the 32 bit value at the memory
//                                  //  location specified by EDX to the return
//                                  //  stack, size specifier required
//                                  //  32 bit address mode only
//  R8 PUSH,                        // pushes R8 to the return stack
//                                  //  64 bit address mode only
//
//  
// Note:
//    The data size of this instruction is limited to 16BIT or 32BIT in
//     32 bit address mode
//    The data size of this instruction is limited to 16BIT, or 64BIT
//     in 64 bit address mode but some operating systems require the return
//     stack pointer always be 64 bit aligned which means you can't use the
//     16BIT push on those operating systems.
//    8BIT targets are not supported.
//    
// 
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpopcomma ( POP, )
//
// C prototype:
//  void dg_forthpopcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where 
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist                    
//
//
//  The parameter list for the target can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   16BIT
//   32BIT
//   64BIT
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   datavalue datasize IMMEDIATE
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               32 bit value of an immediate target
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. N is limited to a signed 32 bit
//                                 integer.
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate 
//                                 value in bytes, can be either 0, 1, 2, or 4
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A 
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes
//                                 in 32 bit address mode this can be 2 or 4
//                                 in 64 bit address mode this can be 2 or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//                                                              
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//    an x86 POP instruction.
//    This opcode sequence pops the value off the return stack and
//     stores the value into the target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  EAX  POP,                      // pops EAX from the return stack
//                                 //  32 bit address mode only
//  CX  POP,                       // pops CX from the return stack
//  EDX [R]  32BIT POP,            // pops the return stack and stores the
//                                 //  32 bit value to the memory location
//                                 //  specified by EDX,
//                                 //  32 bit address mode only
//                                 //  size specifier required
//  R8 POP,                        // pop R8 from the return stack
//                                 //  64 bit address mode only
//
// Note:
//    The data size of this instruction is limited to 16BIT or 32BIT in
//     32 bit address mode
//    The data size of this instruction is limited to 16BIT, or 64BIT
//     in 64 bit address mode
//    8BIT targets are not supported.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_getshiftopcodetype
//
// C prototype: 
//  UINT64 dg_getshiftopcodetype (dg_Sibformatter* pmymodesf)
//
// Inputs:
//  dg_Sibformatter*        pmymodesf     pointer to a dg_Sibformatter structure
//                                         which holds a description of an
//                                         assembler instruction target.
//                                         This holds the mode target pulled from
//                                         the data stack.
//
// Outputs:
//  UINT64                  return        code for the type of shift instruction
//                                         which is one of:
//                                          0  for shift 1
//                                          1  for shift N
//                                          2  for shift CL
//                                          -1 for unsupported mode
//                              
// Action:
//   Checks the target type of the sib formatter to determine what kind of
//    shift opcode to compile.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_pullandcompileshiftop
//
// C prototype: 
//  void dg_pullandcompileshiftop (
//    Bufferhandle* pBHarrayhead,
//    UINT64 opcodeextension)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
//  UINT64        opcodeextension opcode extension for the specific shift
//                                 instruction
//
// Outputs:
//  none
//                              
// Action:
//   Pulls the target parameter list, then the mode parameter list, then
//    compiles the opcode sequence for the shift instruction determined by the
//    opcodeextension.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthsetcomma ( SET, C->R8ORM8, )
//
// C prototype:
//  void dg_forthsetcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( targetparameterlist conditioncode -- )
//
// Data stack in:
//  targetparameterlist           parameter list for the target
//  conditioncode                 0-15, x86 code for conditional instructions
//                                 is one of:
//                                  VS   for overflow set
//                                  NV   for overflow clear or no overflow
//                                  CS   for carry set
//                                  NC   for carry clear or no carry
//                                  ULT  for unsigned less than
//                                  ULE  for unsigned less than or equal
//                                  UGT  for unsigned greater than
//                                  UGE  for unsigned greater than or equal
//                                  ZS   for zero set
//                                  NZ   for zero clear or no zero
//                                  EQ   for equal or zero set
//                                  NE   for not equal or zero clear
//                                  SS   for sign set
//                                  NS   for no sign or sign clear
//                                  MI   for minus or sign set
//                                  PL   for plus or sign clear
//                                  PS   for parity set
//                                  NP   for no parity or parity clear
//                                  LT   for signed less than
//                                  GE   for signed greater than or equal
//                                  LE   for signed less than or equal
//                                  GT   for signed greater than
//                                  ALWAYS
//                                  NEVER
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist                    
//
//
//  The parameter list for the target can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   8BIT
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in 32 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A 
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//                                  
// Data stack out:
//  none
//                                                              
// Execute state action:
//  Pulls a conditioncode and one target from the data stack and compiles the 
//  opcode sequence for an x86 SETcc instruction. This opcode sequence 
//  zero extends a condition code flag bit to 8 bits and stores it into
//  an 8BIT target. Only 8BIT targets are supported.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  AL CS  SET,             // if carry flag is set, AL is set to 1
//                          // if carry flag is clear, AL is set to 0
//  EDX [R]  8BIT  NE SET,  // if zero flag is clear, the 8 bit memory at the
//                          //  address specified by EDX is set to 1
//                          // if zero flag is set, the 8 bit memory at the
//                          //  address specified by EDX is set to 0
//                          //  32 bit address mode only
//  RSI [R]  8BIT  NE SET,  // if zero flag is clear, the 8 bit memory at the
//                          //  address specified by RSI is set to 1
//                          // if zero flag is set, the 8 bit memory at the
//                          //  address specified by RSI is set to 0
//                          //  64 bit address mode only
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrclcomma ( RCL, )
//
// C prototype:
//  void dg_forthrclcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( modeparameterlist targetparameterlist -- )
//
// Data stack in:
//  modeparameterlist             parameter list for the mode
//                                 which is the number of times to shift the target
//  targetparameterlist           parameter list for the target
//
//  The mode parameter list can contain these mode specifiers:
//   CL
//   immediatevalue N
//
//  The parameter list for the target can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   8BIT
//   16BIT
//   32BIT
//   64BIT
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Alternative way to set the mode:
//   CL R
//   immediatevalue minimumimmediatesize IMMEDIATE
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of mode parameters:
//   immediatevalue               this is the shift size in bits
//                                 if the data size is 64 bit, only the
//                                 lower 6 bits are used, otherwise only the
//                                 lower 5 bits are used
//   CL                           this is the shift size in bits
//                                 if the data size is 64 bit, only the
//                                 lower 6 bits are used, otherwise only the
//                                 lower 5 bits are used
//   1                            this is the shift size in bits
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A 
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1, 2, 4, or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.       
//                                  
// Data stack out:
//  none
//                                                              
// Execute state action:
//  Pulls the target and the mode from the data stack then compiles the opcode 
//   sequence for an x86 RCL instruction. This opcode sequence rotates the target
//   left through the carry flag bit the number of times specified in the mode.
//   (The bits shifted off the left are shifted into the carry flag, then
//   shifted back in from the right.)
//  If the target data size is 64 bits, the lower 6 bits of the shift count
//   mode parameter are used, otherwise the lower 5 bits are used.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  2 N  EAX  RCL,              // rotates EAX left twice through the carry flag
//  CL  EBX  RCL,               // rotates EBX left CL times through the carry
//                              //  flag
//  7 N  EDX [R]  32BIT RCL,    // rotates the 32 bit memory at the address
//                              //  specified by EDX left 7 times through the
//                              //  carry flag
//                              //  32 bit address mode only
//  7 N  RDI [R]  32BIT RCL,    // rotates the 32 bit memory at the address
//                              //  specified by RDI left 7 times through the
//                              //  carry flag
//                              //  64 bit address mode only
//  3 N  RCX  RCL,              // rotates RCX left 3 times through the carry flag
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrcrcomma ( RCR, )
//
// C prototype:
//  void dg_forthrcrcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( modeparameterlist targetparameterlist -- )
//
// Data stack in:
//  modeparameterlist             parameter list for the mode
//                                 which is the number of times to shift the target
//  targetparameterlist           parameter list for the target
//
//  The mode parameter list can contain these mode specifiers:
//   CL
//   immediatevalue N
//
//  The parameter list for the target can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   8BIT
//   16BIT
//   32BIT
//   64BIT
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Alternative way to set the mode:
//   CL R
//   immediatevalue minimumimmediatesize IMMEDIATE
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of mode parameters:
//   immediatevalue               this is the shift size in bits
//                                 if the data size is 64 bit, only the
//                                 lower 6 bits are used, otherwise only the
//                                 lower 5 bits are used
//   CL                           this is the shift size in bits
//                                 if the data size is 64 bit, only the
//                                 lower 6 bits are used, otherwise only the
//                                 lower 5 bits are used
//   1                            this is the shift size in bits
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A 
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1, 2, 4, or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//                                                              
// Execute state action:
//  Pulls the target and the mode from the data stack then compiles the opcode
//   sequence for an x86 RCR instruction. This opcode sequence rotates the target
//   right through the carry flag bit the number of times specified in the mode.
//   (The bits shifted off the right are shifted into the carry flag, then
//   shifted back in from the left.)
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  2 N  EAX  RCR,              // rotates EAX right twice through the carry flag
//  CL  EBX  RCR,               // rotates EBX right CL times through the carry
//                              //  flag
//  7 N  EDX [R]  32BIT RCR,    // rotates the 32 bit memory at the address
//                              //  specified by EDX right 7 times through the
//                              //  carry flag
//                              //  32 bit address mode only
//  7 N  RDI [R]  32BIT RCR,    // rotates the 32 bit memory at the address
//                              //  specified by RDI right 7 times through the
//                              //  carry flag
//                              //  64 bit address mode only
//  3 N  RCX  RCR,              // rotates RCX right 3 times through the carry flag
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrolcomma ( ROL, )
//
// C prototype:
//  void dg_forthrolcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( modeparameterlist targetparameterlist -- )
//
// Data stack in:
//  modeparameterlist             parameter list for the mode
//                                 which is the number of times to shift the target
//  targetparameterlist           parameter list for the target
//
//  The mode parameter list can contain these mode specifiers:
//   CL
//   immediatevalue N
//
//  The parameter list for the target can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   8BIT
//   16BIT
//   32BIT
//   64BIT
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Alternative way to set the mode:
//   CL R
//   immediatevalue minimumimmediatesize IMMEDIATE
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of mode parameters:
//   immediatevalue               this is the shift size in bits
//                                 if the data size is 64 bit, only the
//                                 lower 6 bits are used, otherwise only the
//                                 lower 5 bits are used
//   CL                           this is the shift size in bits
//                                 if the data size is 64 bit, only the
//                                 lower 6 bits are used, otherwise only the
//                                 lower 5 bits are used
//   1                            this is the shift size in bits
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A 
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1, 2, 4, or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//                                                              
// Execute state action:
//  Pulls the target and the mode from the data stack then compiles the opcode
//   sequence for an x86 ROL instruction. This opcode sequence rotates the target
//   left the number of times specified in the mode. (The bits shifted off the
//   left are shifted in from the right.)
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  2 N  EAX  ROL,              // rotates EAX left twice
//  CL  EBX  ROL,               // rotates EBX left CL times
//  7 N  EDX [R]  32BIT ROL,    // rotates the 32 bit memory at the address
//                              //  specified by EDX left 7 times
//                              //  32 bit addressing mode only
//  2 N  RCX  ROL,              // rotates RCX left twice
//                              //  64 bit addressing mode only
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrorcomma ( ROR, )
//
// C prototype:
//  void dg_forthrorcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( modeparameterlist targetparameterlist -- )
//
// Data stack in:
//  modeparameterlist             parameter list for the mode
//                                 which is the number of times to shift the target
//  targetparameterlist           parameter list for the target
//
//  The mode parameter list can contain these mode specifiers:
//   CL
//   immediatevalue N
//
//  The parameter list for the target can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   8BIT
//   16BIT
//   32BIT
//   64BIT
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Alternative way to set the mode:
//   CL R
//   immediatevalue minimumimmediatesize IMMEDIATE
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of mode parameters:
//   immediatevalue               this is the shift size in bits
//                                 if the data size is 64 bit, only the
//                                 lower 6 bits are used, otherwise only the
//                                 lower 5 bits are used
//   CL                           this is the shift size in bits
//                                 if the data size is 64 bit, only the
//                                 lower 6 bits are used, otherwise only the
//                                 lower 5 bits are used
//   1                            this is the shift size in bits
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A 
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1, 2, 4, or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//                                                              
// Execute state action:
//  Pulls the target and the mode from the data stack then compiles the opcode
//   sequence for an x86 ROR instruction. This opcode sequence rotates the target
//   right the number of times specified in the mode. (The bits shifted off the
//   right are shifted in from the left.)
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  2 N  EAX  ROR,              // rotates EAX right twice
//  CL  EBX  ROR,               // rotates EBX right CL times
//  7 N  EDX [R]  32BIT ROR,    // rotates the 32 bit memory at the address
//                              //  specified by EDX right 7 times
//                              //  32 bit addressing mode only
//  2 N  RCX  ROR,              // rotates RCX right twice
//                              //  64 bit addressing mode only
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrorxcomma ( RORX, )
//
// C prototype:
//  void dg_forthrorxcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( modeparameterlist srctargetparameterlist desttargetparameterlist -- )
//
// Data stack in:
//  modeparameterlist             parameter list for the mode
//                                 which is the number of times to shift the target
//  srctargetparameterlist        parameter list for the source target
//  desttargetparameterlist       parameter list for the destination target
//
//  The mode parameter list can contain this mode specifier:
//   immediatevalue N
//
//  The desttargetparameterlist can contain this addressing mode
//   specifier:
//
//   targetregister
//
//  The srctargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the srctargetparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  You do not need to set the data size for memory targets. But if
//   you want to, it must be the same size as the destination register target
//   and be one of these:
//   32BIT
//   64BIT
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Alternative way to set the mode:
//   immediatevalue minimumimmediatesize IMMEDIATE
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of mode parameters:
//   immediatevalue               this is the shift size in bits
//                                 if the data size is 64 bit, only the
//                                 lower 6 bits are used, otherwise only the
//                                 lower 5 bits are used
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. 
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 4 or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls the target and the mode from the data stack then compiles the opcode
//   sequence for an x86 RORX instruction. This opcode sequence rotates the target
//   right the number of times specified in the mode but does not alter any flags.
//   (The bits shifted off the right are shifted in from the left.)
//   Then the result is put into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  2 N  EAX  ECX  RORX,        // rotates EAX right twice and puts the result
//                              //  into ECX
//  7 N  RDX [R]  EAX  RORX,    // rotates the 32 bit memory at the address
//                              //  in RDX right 7 times and puts the result
//                              //  into EAX
//  2 N  RCX  R8  RORX,         // rotates RCX right twice and puts the result
//                              //  into R8
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthshlcomma ( SAL, SHL, )
//
// C prototype:
//  void dg_forthshlcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( modeparameterlist targetparameterlist -- )
//
// Data stack in:
//  modeparameterlist             parameter list for the mode
//                                 which is the number of times to shift the target
//  targetparameterlist           parameter list for the target
//
//  The mode parameter list can contain these mode specifiers:
//   CL
//   immediatevalue N
//
//  The parameter list for the target can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   8BIT
//   16BIT
//   32BIT
//   64BIT
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Alternative way to set the mode:
//   CL R
//   immediatevalue minimumimmediatesize IMMEDIATE
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of mode parameters:
//   immediatevalue               this is the shift size in bits
//                                 if the data size is 64 bit, only the
//                                 lower 6 bits are used, otherwise only the
//                                 lower 5 bits are used
//   CL                           this is the shift size in bits
//                                 if the data size is 64 bit, only the
//                                 lower 6 bits are used, otherwise only the
//                                 lower 5 bits are used
//   1                            this is the shift size in bits
//
//  Description of target parameters:
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A 
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1, 2, 4, or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//                                                              
// Execute state action:
//  Pulls the target and the mode from the data stack then compiles the opcode
//   sequence for an x86 SAL instruction. This opcode sequence shifts the target
//   left the number of times specified in the mode. Zeros are shifted in from
//   the right.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  2 N  EAX  SHL,              // shifts EAX left twice
//  CL  EBX  SHL,               // shifts EBX left CL times
//  7 N  EDX [R]  32BIT SHL,    // shifts the 32 bit memory at the address
//                              //  specified by EDX left 7 times
//                              //  32 bit addressing mode only
//  2 N  RCX  SHL,              // shifts RCX left twice
//                              //  64 bit addressing mode only
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthshlxcomma ( SHLX, )
//
// C prototype:
//  void dg_forthshlxcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( modeparameterlist srctargetparameterlist desttargetparameterlist -- )
//
// Data stack in:
//  modeparameterlist             parameter list for the mode
//                                 which is the number of times to shift the target
//  srctargetparameterlist        parameter list for the source target
//  desttargetparameterlist       parameter list for the destination target
//
//  The mode parameter list can contain this mode specifier:
//   targetregister
//
//  The desttargetparameterlist can contain this addressing mode
//   specifier:
//
//   targetregister
//
//  The srctargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the srctargetparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  You do not need to set the data size for memory targets. But if
//   you want to, it must be the same size as the destination register target
//   and be one of these:
//   32BIT
//   64BIT
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of mode parameters:
//   targetregister               this register holds the shift size in bits
//                                 if the data size is 64 bit, only the
//                                 lower 6 bits are used, otherwise only the
//                                 lower 5 bits are used
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 4, or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls the destination target, the source target, and the mode from the data
//   stack then compiles the opcode sequence for an x86 SHLX instruction. This
//   opcode sequence shifts the value of the source target left the number of
//   times specified in the mode register and puts the result into the
//   destination register. Zeros are shifted in from the right. This opcode
//   sequence does not alter any flags.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  EDX  EAX  ECX  SHLX,        // logically shifts EAX left and puts
//                              //  the result into ECX. The number of bits
//                              //  shifted is the value of EDX.
//  ECX  RDX [R]  EAX  SHLX,    // logically shifts the 32 bit memory at
//                              //  the address in RDX left and puts the result
//                              //  into EAX. The number of bits shifted is the
//                              //  value of ECX
//  RAX  RCX  R8  SHLX,         // logically shifts RCX left and puts the
//                              //  result into R8. The number of bits shifted
//                              //  is the value of RAX
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthshrcomma ( SHR, )
//
// C prototype:
//  void dg_forthshrcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( modeparameterlist targetparameterlist -- )
//
// Data stack in:
//  modeparameterlist             parameter list for the mode
//                                 which is the number of times to shift the target
//  targetparameterlist           parameter list for the target
//
//  The mode parameter list can contain these mode specifiers:
//   CL
//   immediatevalue N
//
//  The parameter list for the target can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   8BIT
//   16BIT
//   32BIT
//   64BIT
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Alternative way to set the mode:
//   CL R
//   immediatevalue minimumimmediatesize IMMEDIATE
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of mode parameters:
//   immediatevalue               this is the shift size in bits
//                                 if the data size is 64 bit, only the
//                                 lower 6 bits are used, otherwise only the
//                                 lower 5 bits are used
//   CL                           this is the shift size in bits
//                                 if the data size is 64 bit, only the
//                                 lower 6 bits are used, otherwise only the
//                                 lower 5 bits are used
//   1                            this is the shift size in bits
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A 
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1, 2, 4, or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//                                                              
// Execute state action:
//  Pulls the target and the mode from the data stack then compiles the opcode
//   sequence for an x86 SHR instruction. This opcode sequence shifts the target
//   right the number of times specified in the mode. Zeros are shifted in from
//   the left.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  2 N  EAX  SHR,              // shifts EAX right twice
//  CL  EBX  SHR,               // shifts EBX right CL times
//  7 N  EDX [R]  32BIT SHR,    // shifts the 32 bit memory at the address
//                              //  specified by EDX right 7 times
//                              //  32 bit addressing mode only
//  4 N  RCX  SHR,              // shifts RCX right 4 times
//                              //  64 bit addressing mode only
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthshrxcomma ( SHRX, )
//
// C prototype:
//  void dg_forthshrxcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( modeparameterlist srctargetparameterlist desttargetparameterlist -- )
//
// Data stack in:
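//  modeparameterlist             parameter list for the mode, which is the
//                                 register holding the number of times to
//                                 shift the source target
//  srctargetparameterlist        parameter list for the source target
//  desttargetparameterlist       parameter list for the destination target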
//
//  The mode parameter list can contain this mode specifier:
//   targetregister
//
//  The desttargetparameterlist can contain this addressing mode
//   specifier:
//
//   targetregister
//
//  The srctargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the srctargetparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  You do not need to set the data size for memory targets. But if
//   you want to, it must be the same size as the source register target
//   and be one of these:
//   32BIT
//   64BIT
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of mode parameters:
//   targetregister               this register holds the shift size in bits
//                                 if the data size is 64 bit, only the
//                                 lower 6 bits are used, otherwise only the
//                                 lower 5 bits are used
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 4, or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls the destination target, the source target, and the mode from the data
//   stack then compiles the opcode sequence for an x86 SHRX instruction. This
//   opcode sequence shifts the value of the source target right the number of
//   times specified in the mode register and puts the result into the
//   destination register. Zeros are shifted in from the left. This opcode
//   sequence does not alter any flags.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  EDX  EAX  ECX  SHRX,        // logically shifts EAX right and puts
//                              //  the result into ECX. The number of bits
//                              //  shifted is the value of EDX.
//  ECX  RDX [R]  EAX  SHRX,    // logically shifts the 32 bit memory at
//                              //  the address in RDX right and puts the result
//                              //  into EAX. The number of bits shifted is the
//                              //  value of ECX
//  RAX  RCX  R8  SHRX,         // logically shifts RCX right and puts the
//                              //  result into R8. The number of bits shifted
//                              //  is the value of RAX
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthsarcomma ( SAR, )
//
// C prototype:
//  void dg_forthsarcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( modeparameterlist targetparameterlist -- )
//
// Data stack in:
//  modeparameterlist             parameter list for the mode
//                                 which is the number of times to shift the target
//  targetparameterlist           parameter list for the target
//
//  The mode parameter list can contain these mode specifiers:
//   CL
//   immediatevalue N
//
//  The parameter list for the target can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   8BIT
//   16BIT
//   32BIT
//   64BIT
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Alternative way to set the mode:
//   CL R
//   immediatevalue minimumimmediatesize IMMEDIATE
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of mode parameters:
//   immediatevalue               this is the shift size in bits
//                                 if the data size is 64 bit, only the
//                                 lower 6 bits are used, otherwise only the
//                                 lower 5 bits are used
//   CL                           this is the shift size in bits
//                                 if the data size is 64 bit, only the
//                                 lower 6 bits are used, otherwise only the
//                                 lower 5 bits are used
//   1                            this is the shift size in bits
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A 
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1, 2, 4, or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//                                                              
// Execute state action:
//  Pulls the target and the mode from the data stack then compiles the opcode
//   sequence for an x86 SAR instruction. This opcode sequence shifts the target 
//   right the number of times specified in the mode. The value of the highest
//   bit (the sign bit) is extended from the left.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  2 N  EAX  SAR,              // shifts EAX right twice, extending the sign
//                              //  bit from the left
//  CL  EBX  SAR,               // shifts EBX right CL times, extending the sign
//                              //  bit from the left
//  7 N  EDX [R]  32BIT SAR,    // shifts the 32 bit memory at the address
//                              //  specified by EDX right 7 times, extending
//                              //  the sign bit from the left
//                              //  32 bit addressing mode only
//  3 N  RDX  SAR,              // shifts RDX right 3 times, extending the sign
//                              //  bit from the left. 64 bit addressing mode only
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthsarxcomma ( SARX, )
//
// C prototype:
//  void dg_forthsarxcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the other used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( modeparameterlist srctargetparameterlist desttargetparameterlist -- )
//
// Data stack in:
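//  modeparameterlist             parameter list for the mode, which is the
//                                 register holding the number of bits to shift
//  srctargetparameterlist        parameter list for the source target, which
//                                 is the value that gets shifted
//  desttargetparameterlist       parameter list for the destination target,
//                                 which receives the shifted result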
//
//  The mode parameter list can contain this mode specifier:
//   targetregister
//
//  The desttargetparameterlist can contain this addressing mode
//   specifier:
//
//   targetregister
//
//  The srctargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the srctargetparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  You do not need to set the data size for memory targets. But if
//   you want to, it must be the same size as the source register target
//   and be one of these:
//   32BIT
//   64BIT
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of mode parameters:
//   targetregister               register holding the shift size in bits;
//                                 if the data size is 64 bit, only the
//                                 lower 6 bits are used, otherwise only the
//                                 lower 5 bits are used
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 4, or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls the target and the mode from the data stack then compiles the opcode
//   sequence for an x86 SARX instruction. This opcode sequence shifts the target
//   right the number of times specified in the mode. The value of the highest
//   bit (the sign bit) is extended from the left. This opcode sequence does not
//   alter any flags. Then the result is put into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  EDX  EAX  ECX  SARX,        // arithmetically shifts EAX right and puts
//                              //  the result into ECX. The number of bits
//                              //  shifted is the value of EDX.
//  ECX  RDX [R]  EAX  SARX,    // arithmetically shifts the 32 bit memory at
//                              //  the address in RDX right and puts the result
//                              //  into EAX. The number of bits shifted is the
//                              //  value of ECX
//  RAX  RCX  R8  SARX,         // arithmetically shifts RCX right and puts the
//                              //  result into R8. The number of bits shifted
//                              //  is the value of RAX
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthnopcomma ( NOP, )
//
// C prototype:
//  void dg_forthnopcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode sequence for an x86 NOP instruction.
//  This opcode sequence does nothing. NOP is short for 'no operation'.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  NOP,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthaltooutdxcomma ( AL->OUT[DX], )
//
// C prototype:
//  void dg_forthaltooutdxcomma (Bufferhandle* pBHarrayhead) 
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode sequence for an x86 OUT DX, AL instruction.
//  This opcode sequence sends the value in AL out i/o port DX.
//
// Note:
//  I did not test this instruction. ;-)
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  AL->OUT[DX],
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthaxtooutdxcomma ( AX->OUT[DX], )
//
// C prototype:
//  void dg_forthaxtooutdxcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the other used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode sequence for an x86 OUT DX, AX instruction.
//  This opcode sequence sends the value in AX out i/o port DX.
//
// Note:
//  I did not test this instruction. ;-)
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  AX->OUT[DX],
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_fortheaxtooutdxcomma ( EAX->OUT[DX], )
//
// C prototype:
//  void dg_fortheaxtooutdxcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode sequence for an x86 OUT DX, EAX instruction.
//  This opcode sequence sends the value in EAX out i/o port DX.
//
// Note:
//  I did not test this instruction. ;-)
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  EAX->OUT[DX],
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthindxtoalcomma ( IN[DX]->AL, )
//
// C prototype:
//  void dg_forthindxtoalcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode sequence for an x86 IN AL, DX instruction.
//  This opcode sequence loads the value at i/o port DX into AL.
//
// Note:
//  I did not test this instruction. ;-)
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  IN[DX]->AL,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthindxtoaxcomma ( IN[DX]->AX, )
//
// C prototype:
//  void dg_forthindxtoaxcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the other used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode sequence for an x86 IN AX, DX instruction.
//  This opcode sequence loads the value at i/o port DX into AX.
//
// Note:
//  I did not test this instruction. ;-)
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  IN[DX]->AX,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthindxtoeaxcomma ( IN[DX]->EAX, )
//
// C prototype:
//  void dg_forthindxtoeaxcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode sequence for an x86 IN EAX, DX instruction.
//  This opcode sequence loads the value at i/o port DX into EAX.
//
// Note:
//  I did not test this instruction. ;-)
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  IN[DX]->EAX,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthretcomma ( RET, )
//
// C prototype:
//  void dg_forthretcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode sequence for an x86 RET instruction. This instruction is
//   a return from subroutine, which is basically EIP POP, in 32 bit address mode
//   or RIP POP, in 64 bit address mode.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  RET,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthretdropn16comma ( RETDROPN16, )
//
// C prototype:
//  void dg_forthretdropn16comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( u16 -- )
//
//  Description of target parameters:
//
//   u16                          number of bytes to drop from the return stack
//                                 u is limited to the range of an unsigned 16
//                                 bit integer
//
// Execute state action:
//  Compiles the opcode sequence for an x86 RET and drop N16 from the return stack 
//   instruction. This instruction is a return from subroutine, which is basically 
//   RIP POP, followed by  u N  RSP  ADD,  in 64 bit mode.  In 32 bit mode, it
//   does EIP POP, followed by  u N  ESP  ADD,  .
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  20 RETDROPN16,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthretfarcomma ( RETFAR, )
//
// C prototype:
//  void dg_forthretfarcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode sequence for an x86 far RET instruction. This instruction 
//   is a return from far subroutine, which is basically EIP POP, in 32 bit 
//   address mode or RIP POP, in 64 bit address mode. It also pops the CS register, 
//   and depending on the operating mode of the CPU, if the CS register is the 
//   index of a segment selector, it copies the CPL (current privilege level)
//   field of the segment selector to the segment selector's RPL (requested 
//   privilege level) field. Then it pops the RSP and SS registers. And during all 
//   this it does numerous security checks to make sure your task is allowed to do 
//   this stuff. I recommend reading the official docs for a better description.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  RETFAR,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthretfardropn16comma ( RETFARDROPN16, )
//
// C prototype:
//  void dg_forthretfardropn16comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( u16 -- )
//
//  Description of target parameters:
//
//   u16                          number of bytes to drop from the return stack
//                                 u is limited to the range of an unsigned 16
//                                 bit integer
//                                                              
// Execute state action:
//  Compiles the opcode sequence for an x86 far RET instruction. This instruction 
//   is a return from far subroutine, which is basically EIP POP, in 32 bit 
//   address mode or RIP POP, in 64 bit address mode. It also pops the CS register, 
//   and depending on the operating mode of the CPU, if the CS register is the 
//   index of a segment selector, it copies the CPL (current privilege level)
//   field of the segment selector to the segment selector's RPL (requested 
//   privilege level) field. Then it pops the RSP and SS registers. And during all 
//   this it does numerous security checks to make sure your task is allowed to do 
//   this stuff. I recommend reading the official docs for a better description.
//   After all this it drops u bytes from the return stack.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  48 RETFARDROPN16,
//
// Note:
//  This instruction is used with far calls using call gates that have some data
//   associated with them. Intel docs say u16 has to match the word count value 
//   used in the call gate.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthstaccomma ( STAC, )
//
// C prototype:
//  void dg_forthstaccomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the other used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode sequence for an x86 STAC instruction. This instruction
//    sets the alignment checking flag bit.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  STAC,
//
// Note:
//  This is a privilege level 0 instruction among other things.
//  Attempting to use this instruction at a privilege level > 0 causes an
//  undefined opcode exception.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthstccomma ( STC, )
//
// C prototype:
//  void dg_forthstccomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode sequence for an x86 STC instruction. This instruction
//    sets the carry flag bit.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  STC,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthsticomma ( STI, )
//
// C prototype:
//  void dg_forthsticomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode sequence for an x86 STI instruction. This instruction
//   sets the interrupt flag bit. This enables interrupts after the next
//   instruction. (They do this so you can do a RET, before interrupts are
//   re-enabled.)
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  STI,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthstdcomma ( STD, )
//
// C prototype:
//  void dg_forthstdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode sequence for an x86 STD instruction. This instruction
//   sets the direction flag bit. When the direction bit is set, ESI and EDI
//   decrement during string operations. Some operating systems require the
//   direction bit to be left set to forward (clear) when passing control to
//   operating system routines.
//   I usually push the flag register when entering and pop it when exiting
//   subroutines to cover this requirement.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  STD,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthswapgscomma ( SWAPGS, )
//
// C prototype:
//  void dg_forthswapgscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the other used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode sequence for an x86 SWAPGS instruction. This instruction
//    swaps the GS base register with the value in MSR address 0xC0000102.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  SWAPGS,
//
// Note:
//  64 bit address mode only instruction.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthsysretto32comma ( SYSRETTO32, )
//
// C prototype:
//  void dg_forthsysretto32comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode sequence for an x86 SYSRETTO32 instruction. This instruction
//    returns to compatibility mode from a fast system call.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  SYSRETTO32,
//
// Note:
//  64 bit address mode only instruction.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthsysretto64comma ( SYSRETTO64, )
//
// C prototype:
//  void dg_forthsysretto64comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode sequence for an x86 SYSRETTO64 instruction. This instruction
//    returns to 64 bit mode from a fast system call.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  SYSRETTO64,
//
// Note:
//  64 bit address mode only instruction.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthsahfcomma ( SAHF, AH->EFLAGS, )
//
// C prototype:
//  void dg_forthsahfcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode sequence for an x86 SAHF instruction. This instruction
//   copies the AH register into the lower 8 bits of the flag register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  AH->EFLAGS,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthlarcomma ( LAR, )
//
// C prototype:
//  void dg_forthlarcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how targetxparameterlist is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AX BX CX DX BP SI DI SP
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 2
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after targetxparameterlist
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 LAR instruction. This opcode sequence loads the access rights from
//   the segment descriptor in the source to the destination. Data size is
//   ignored for this instruction. You can specify direction, but memory
//   targets can not be the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]   ECX   LAR,       // LAR([RAX][15:0]) -> CX[15:0]
//  RAX       ECX   LAR,       // LAR(RAX[31:0]) -> ECX[31:0]
//
// Note:
//  Memory targets are 16 bits. If the source is memory, only 16 bits are
//   changed, otherwise 32 bits are changed.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthlddqucomma ( LDDQU, )
//
// C prototype:
//  void dg_forthlddqucomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0, 1, 2, or 4
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 LDDQU instruction. This opcode sequence copies a 128 bit value
//   from the source memory target to an xmm register destination. The source
//   memory target does not need to be aligned to a 16 byte boundary like
//   other memory to xmm register opcode sequences. Data size is ignored.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//   RAX [R]  XMM0  LDDQU,  // [RAX][127:0] -> XMM0
//
// Note:
//  The source must be memory and the destination must be an xmm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvlddqucomma ( VLDDQU, )
//
// C prototype:
//  void dg_forthvlddqucomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0, 1, 2, or 4
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VLDDQU instruction. This opcode sequence copies a 128 or 256 bit 
//   value from the source memory target to an xmm or ymm register destination. 
//   The source memory target does not need to be aligned to a 16/32 byte boundary
//   like other memory to xmm or ymm register opcode sequences.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//   RAX [R]  XMM0  VLDDQU,  // [RAX][127:0] -> XMM0
//
//   RAX [R]  YMM0  VLDDQU,  // [RAX][255:0] -> YMM0
//
// Note:
//  The source must be memory and the destination must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthldmxcsrcomma ( LDMXCSR, )
//
// C prototype:
//  void dg_forthldmxcsrcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 LDMXCSR instruction. This opcode sequence copies the 32 bit value
//   from the source to the MXCSR register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  LDMXCSR,        //  [RAX] -> MXCSR
//
// Note:
//  Data size is not required for this instruction and is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvldmxcsrcomma ( VLDMXCSR, )
//
// C prototype:
//  void dg_forthvldmxcsrcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 VLDMXCSR instruction. This opcode sequence copies the 32 bit value
//   from the source to the MXCSR register. Memory target size is ignored.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  VLDMXCSR,          //  [RAX] -> MXCSR
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthldscomma ( LDS, )
//
// C prototype:
//  void dg_forthldscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how targetxparameterlist is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 2
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after targetxparameterlist
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 LDS instruction. This opcode sequence moves a 16 bit segment
//   and 32 bit offset from the source memory target to the destination.
//   The segment selector is moved to the DS register, and the 32 bit offset
//   is moved to the destination register. Data size and register size are
//   ignored for this instruction. This opcode sequence is valid only in 32
//   bit address mode.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 32 bit address mode example:
//  RAX [R]   ECX   LDS,     // [RAX][?] -> ECX[31:0]
//                           // [RAX][?] -> DS[15:0]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthlescomma ( LES, )
//
// C prototype:
//  void dg_forthlescomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how targetxparameterlist is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 2
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after targetxparameterlist
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 LES instruction. This opcode sequence moves a 16 bit segment
//   and 32 bit offset from the source memory target to the destination.
//   The segment selector is moved to the ES register, and the 32 bit offset
//   is moved to the destination register. Data size and register size are
//   ignored for this instruction. This opcode sequence is valid only in 32
//   bit address mode.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 32 bit address mode example:
//  RAX [R]   ECX   LES,     // [RAX][?] -> ECX[31:0]
//                           // [RAX][?] -> ES[15:0]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthlfscomma ( LFS, )
//
// C prototype:
//  void dg_forthlfscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how targetxparameterlist is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 2
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after targetxparameterlist
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 LFS instruction. This opcode sequence moves a 16 bit segment
//   and 32 bit or 64 bit offset from the source memory target to the destination.
//   The segment selector is moved to the FS register, and the offset
//   is moved to the destination register. Memory target size is ignored for this
//   instruction. The data size for this instruction is determined by the
//   register target. So, if you are in 32 bit mode, use a 32 bit register and
//   this compiling word will not compile the rex.w prefix. If you are in
//   64 bit mode, you probably want the 64 bit offset, so use a 64 bit register
//   and this compiling word will compile the rex.w prefix. If you use a 32
//   bit register in 64 bit address mode, no rex.w prefix is compiled and the
//   offset copied is 32 bits.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 32 bit address mode example:
//  RAX [R]   ECX   LFS,     // [RAX][?] -> ECX[31:0]
//                           // [RAX][?] -> FS[15:0]
//
// 64 bit address mode example:
//  RAX [R]   RCX   LFS,     // [RAX][?] -> RCX[63:0]
//                           // [RAX][?] -> FS[15:0]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthlgscomma ( LGS, )
//
// C prototype:
//  void dg_forthlgscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how targetxparameterlist is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 2
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after targetxparameterlist
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 LGS instruction. This opcode sequence moves a 16 bit segment
//   and 32 bit or 64 bit offset from the source memory target to the destination.
//   The segment selector is moved to the GS register, and the offset
//   is moved to the destination register. Memory target size is ignored for this
//   instruction. The data size for this instruction is determined by the
//   register target. So, if you are in 32 bit mode, use a 32 bit register and
//   this compiling word will not compile the rex.w prefix. If you are in
//   64 bit mode, you probably want the 64 bit offset, so use a 64 bit register
//   and this compiling word will compile the rex.w prefix. If you use a 32
//   bit register in 64 bit address mode, no rex.w prefix is compiled and the
//   offset copied is 32 bits.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 32 bit address mode example:
//  RAX [R]   ECX   LGS,     // [RAX][?] -> ECX[31:0]
//                           // [RAX][?] -> GS[15:0]
//
// 64 bit address mode example:
//  RAX [R]   RCX   LGS,     // [RAX][?] -> RCX[63:0]
//                           // [RAX][?] -> GS[15:0]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthlsscomma ( LSS, )
//
// C prototype:
//  void dg_forthlsscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//
//  The parameter list for targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how targetxparameterlist is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 2
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after targetxparameterlist
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 LSS instruction. This opcode sequence moves a 16 bit segment
//   and 32 bit or 64 bit offset from the source memory target to the destination.
//   The segment selector is moved to the SS register, and the offset
//   is moved to the destination register. Memory target size is ignored for this
//   instruction. The data size for this instruction is determined by the
//   register target. So, if you are in 32 bit mode, use a 32 bit register and
//   this compiling word will not compile the rex.w prefix. If you are in
//   64 bit mode, you probably want the 64 bit offset, so use a 64 bit register
//   and this compiling word will compile the rex.w prefix. If you use a 32
//   bit register in 64 bit address mode, no rex.w prefix is compiled and the
//   offset copied is 32 bits.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 32 bit address mode example:
//  RAX [R]   ECX   LSS,     // [RAX][?] -> ECX[31:0]
//                           // [RAX][?] -> SS[15:0]
//
// 64 bit address mode example:
//  RAX [R]   RCX   LSS,     // [RAX][?] -> RCX[63:0]
//                           // [RAX][?] -> SS[15:0]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthsyscallcomma ( SYSCALL, )
//
// C prototype:
//  void dg_forthsyscallcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  64 bit address mode only.
//  Compiles the opcode sequence for an x86 SYSCALL instruction.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  SYSCALL,
//  On Mac OS X, you load RAX with the system function you want to call,
//   and load the registers for a normal System V ABI call.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrepcomma ( REP, )
//
// C prototype:
//  void dg_forthrepcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode byte for an x86 REP instruction prefix.
//  This prefix is used with INSx MOVSx OUTSx LODSx and STOSx to repeat the
//   instruction ECX times in 32 bit address mode and RCX times in 64 bit
//   address mode.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 32 bit address mode example:
//  src  ESI  MOV,
//  dest  EDI  MOV,
//  count  ECX  MOV,
//  REP, MOVSB,        // copies count bytes from src to dest
//
// 64 bit address mode example:
//  src  RSI  MOV,
//  dest  RDI  MOV,
//  count  RCX  MOV,
//  REP, MOVSB,        // copies count bytes from src to dest
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrepecomma ( REPE, NZUNTILREP, ) ( not added yet ZSWHILEREP, )
//
// C prototype:
//  void dg_forthrepecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode byte for an x86 REPE instruction prefix.
//  This prefix is used with CMPSB, CMPSW, CMPSD, CMPSQ,
//   or SCASB, SCASW, SCASD, SCASQ, to repeat the instruction
//   ECX times in 32 bit address mode, RCX times in 64 bit address mode,
//   or until zero flag bit is clear.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 32 bit addressing mode example:
//  src  ESI  MOV,
//  dest  EDI  MOV,
//  count  ECX  MOV,
//  NZUNTILREP,  CMPSB,  compares count bytes at src with dest until
//                        a non match is found
//
// 64 bit addressing mode example:
//  src  RSI  MOV,
//  dest  RDI  MOV,
//  count  RCX  MOV,
//  NZUNTILREP,  CMPSB,  compares count bytes at src with dest until
//                        a non match is found
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrepnecomma ( REPNE, ZSUNTILREP, ) ( not added yet NZWHILEREP, )
//
// C prototype:
//  void dg_forthrepnecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode byte for an x86 REPNE instruction prefix.
//  This prefix is used with CMPSB, CMPSW, CMPSD, CMPSQ,
//   or SCASB, SCASW, SCASD, SCASQ, to repeat the instruction
//   ECX times in 32 bit address mode, RCX times in 64 bit address mode,
//   or until zero flag bit is set.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 32 bit addressing mode example:
//  src  ESI  MOV,
//  dest  EDI  MOV,
//  count  ECX  MOV,
//  ZSUNTILREP,  CMPSB,  compares count bytes at src with dest until
//                        a match is found
//
// 64 bit addressing mode example:
//  src  RSI  MOV,
//  dest  RDI  MOV,
//  count  RCX  MOV,
//  ZSUNTILREP,  CMPSB,  compares count bytes at src with dest until
//                        a match is found
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovsbcomma ( MOVSB, )
//
// C prototype:
//  void dg_forthmovsbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode byte for the x86 MOVSB instruction.
//  In 32 bit addressing mode:
//   Moves an 8 bit value from [ESI] to [EDI] and decrements ECX.
//   EDI and ESI are adjusted according to the direction flag,
//    clear increments and set decrements.
//  In 64 bit addressing mode:
//   Moves an 8 bit value from [RSI] to [RDI] and decrements RCX.
//   RDI and RSI are adjusted according to the direction flag,
//    clear increments and set decrements.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 32 bit addressing mode example:
//  src  ESI  MOV,
//  dest  EDI  MOV,
//  count  ECX  MOV,
//  REP, MOVSB,        // copies count bytes from src to dest
//
// 64 bit addressing mode example:
//  src RSI  MOV,
//  dest  RDI  MOV,
//  count  RCX  MOV,
//  REP, MOVSB,        // copies count bytes from src to dest
//
// Note:
//   Some operating systems require the direction bit to
//   be left to forward (clear) when passing control to operating system
//   routines.
//   I usually push the flag register when entering and pop it when exiting
//   subroutines to cover this requirement.
//   If you are using the DiaperGlu standard frame, this is done for you.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovswcomma ( MOVSW, )
//
// C prototype:
//  void dg_forthmovswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode byte for the x86 MOVSW instruction.
//  In 32 bit addressing mode:
//   Moves a 16 bit value from [ESI] to [EDI] and decrements ECX.
//   EDI and ESI are adjusted according to the direction flag,
//    clear adds 2 and set subtracts 2.
//  In 64 bit addressing mode:
//   Moves a 16 bit value from [RSI] to [RDI] and decrements RCX.
//   RDI and RSI are adjusted according to the direction flag,
//    clear adds 2 and set subtracts 2.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 32 bit addressing mode example:
//  src  ESI  MOV,
//  dest  EDI  MOV,
//  count  ECX  MOV,
//  REP, MOVSW,        // copies count uint16s from src to dest
//
// 64 bit addressing mode example:
//  src RSI  MOV,
//  dest  RDI  MOV,
//  count  RCX  MOV,
//  REP, MOVSW,        // copies count uint16s from src to dest
//
// Note:
//   Some operating systems require the direction bit to
//   be left to forward (clear) when passing control to operating system
//   routines.
//   I usually push the flag register when entering and pop it when exiting
//   subroutines to cover this requirement.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovsdcomma ( MOVSD, )
//
// C prototype:
//  void dg_forthmovsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode byte for the x86 MOVSD instruction.
//  In 32 bit addressing mode:
//   Moves a 32 bit value from [ESI] to [EDI] and decrements ECX.
//   EDI and ESI are adjusted according to the direction flag,
//    clear adds 4 and set subtracts 4.
//  In 64 bit addressing mode:
//   Moves a 32 bit value from [RSI] to [RDI] and decrements RCX.
//   RDI and RSI are adjusted according to the direction flag,
//    clear adds 4 and set subtracts 4.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 32 bit addressing mode example:
//  src  ESI  MOV,
//  dest  EDI  MOV,
//  count  ECX  MOV,
//  REP, MOVSD,        // copies count uint32s from src to dest
//
// 64 bit addressing mode example:
//  src RSI  MOV,
//  dest  RDI  MOV,
//  count  RCX  MOV,
//  REP, MOVSD,        // copies count uint32s from src to dest
//
// Note:
//   Some operating systems require the direction bit to
//   be left to forward (clear) when passing control to operating system
//   routines.
//   I usually push the flag register when entering and pop it when exiting
//   subroutines to cover this requirement.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovsqcomma ( MOVSQ, )
//
// C prototype:
//  void dg_forthmovsqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode byte for the x86 MOVSQ instruction.
//  Not supported in 32 bit addressing mode.
//  In 64 bit addressing mode:
//   Moves a 64 bit value from [RSI] to [RDI] and decrements RCX.
//   RDI and RSI are adjusted according to the direction flag,
//    clear adds 8 and set subtracts 8.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit addressing mode example:
//  src RSI  MOV,
//  dest  RDI  MOV,
//  count  RCX  MOV,
//  REP, MOVSQ,        // copies count uint64s from src to dest
//
// Note:
//   Some operating systems require the direction bit to
//   be left to forward (clear) when passing control to operating system
//   routines.
//   I usually push the flag register when entering and pop it when exiting
//   subroutines to cover this requirement.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthstosbcomma ( STOSB, )
//
// C prototype:
//  void dg_forthstosbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode byte for the x86 STOSB instruction.
//  In 32 bit addressing mode:
//   Stores the 8 bit value in AL to [EDI], and decrements ECX.
//   EDI is adjusted according to the direction flag,
//    clear increments and set decrements.
//  In 64 bit addressing mode:
//   Stores the 8 bit value in AL to [RDI], and decrements RCX.
//   RDI is adjusted according to the direction flag,
//    clear increments and set decrements.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Note:
//   Some operating systems require the direction bit to
//   be left to forward (clear) when passing control to operating system
//   routines.
//   I usually push the flag register when entering and pop it when exiting
//   subroutines to cover this requirement.
//
// 32 bit addressing mode example:
//  value  AL  MOV,
//  dest  EDI  MOV,
//  count  ECX  MOV,
//  REP,  STOSB,  // fills count bytes at dest with value
//
// 64 bit addressing mode example:
//  value  AL  MOV,
//  dest  RDI  MOV,
//  count  RCX  MOV,
//  REP,  STOSB,  // fills count bytes at dest with value
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthstosdcomma ( STOSD, )
//
// C prototype:
//  void dg_forthstosdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode byte for the x86 STOSD instruction.
//  In 32 bit addressing mode:
//   Stores the 32 bit value in EAX to [EDI], and decrements ECX.
//   EDI is adjusted according to the direction flag,
//    clear adds 4 and set subtracts 4.
//  In 64 bit addressing mode:
//   Stores the 32 bit value in EAX to [RDI], and decrements RCX.
//   RDI is adjusted according to the direction flag,
//    clear adds 4 and set subtracts 4.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Note:
//   Some operating systems require the direction bit to
//   be left to forward (clear) when passing control to operating system
//   routines.
//   I usually push the flag register when entering and pop it when exiting
//   subroutines to cover this requirement.
//
// 32 bit addressing mode example:
//  value  EAX  MOV,
//  dest  EDI  MOV,
//  count  ECX  MOV,
//  REP,  STOSD,  // fills count 32 bit units at dest with value
//
// 64 bit addressing mode example:
//  value  EAX  MOV,
//  dest  RDI  MOV,
//  count  RCX  MOV,
//  REP,  STOSD,  // fills count 32 bit units at dest with value
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthstoswcomma ( STOSW, )
//
// C prototype:
//  void dg_forthstoswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode byte for the x86 STOSW instruction.
//  In 32 bit addressing mode:
//   Stores the 16 bit value in AX to [EDI], and decrements ECX.
//   EDI is adjusted according to the direction flag,
//    clear adds 2 and set subtracts 2.
//  In 64 bit addressing mode:
//   Stores the 16 bit value in AX to [RDI], and decrements RCX.
//   RDI is adjusted according to the direction flag,
//    clear adds 2 and set subtracts 2.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Note:
//   Some operating systems require the direction bit to
//   be left to forward (clear) when passing control to operating system
//   routines.
//   I usually push the flag register when entering and pop it when exiting
//   subroutines to cover this requirement.
//
// 32 bit addressing mode example:
//  value  AX  MOV,
//  dest  EDI  MOV,
//  count  ECX  MOV,
//  REP,  STOSW,  // fills count 16 bit units at dest with value
//
// 64 bit addressing mode example:
//  value  AX  MOV,
//  dest  RDI  MOV,
//  count  RCX  MOV,
//  REP,  STOSW,  // fills count 16 bit units at dest with value
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthstosqcomma ( STOSQ, )
//
// C prototype:
//  void dg_forthstosqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode byte for the x86 STOSQ instruction.
//  Not available in 32 bit addressing mode.
//  In 64 bit addressing mode:
//   Stores the 64 bit value in RAX to [RDI], and decrements RCX.
//   RDI is adjusted according to the direction flag,
//    clear adds 8 and set subtracts 8.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Note:
//   Some operating systems require the direction bit to
//   be left to forward (clear) when passing control to operating system
//   routines.
//   I usually push the flag register when entering and pop it when exiting
//   subroutines to cover this requirement.
//
// 64 bit addressing mode example:
//  value  RAX  MOV,
//  dest  RDI  MOV,
//  count  RCX  MOV,
//  REP,  STOSQ,  // fills count 64 bit units at dest with value
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthscasbcomma ( SCASB, )
//
// C prototype:
//  void dg_forthscasbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode byte for the x86 SCASB instruction.
//  In 32 bit addressing mode:
//   Decrements ECX and compares the 8 bit value in [EDI] with AL.
//   EDI is adjusted according to the direction flag,
//    clear increments and set decrements.
//  In 64 bit addressing mode:
//   Decrements RCX and compares the 8 bit value in [RDI] with AL.
//   RDI is adjusted according to the direction flag,
//    clear increments and set decrements.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Note:
//   Some operating systems require the direction bit to
//   be left to forward (clear) when passing control to operating system
//   routines.
//   I usually push the flag register when entering and pop it when exiting
//   subroutines to cover this requirement.
//
// 32 bit addressing mode example:
//  value  AL  MOV,
//  dest  EDI  MOV,
//  count  ECX  MOV,
//  REPNZUNTIL,  SCASB,  // compares bytes at dest with value until a non match
//                       //  is found
//
// 64 bit addressing mode example:
//  value  AL  MOV,
//  dest  RDI  MOV,
//  count  RCX  MOV,
//  REPNZUNTIL,  SCASB,  // compares bytes at dest with value until a non match
//                       //  is found
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthscasdcomma ( SCASD, )
//
// C prototype:
//  void dg_forthscasdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode byte for the x86 SCASD instruction.
//  In 32 bit addressing mode:
//   Decrements ECX and compares the 32 bit value in [EDI] with EAX.
//   EDI is adjusted according to the direction flag,
//    clear adds 4 and set subtracts 4.
//  In 64 bit addressing mode:
//   Decrements RCX and compares the 32 bit value in [RDI] with EAX.
//   RDI is adjusted according to the direction flag,
//    clear adds 4 and set subtracts 4.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Note:
//   Some operating systems require the direction bit to
//   be left to forward (clear) when passing control to operating system
//   routines.
//   I usually push the flag register when entering and pop it when exiting
//   subroutines to cover this requirement.
//
// 32 bit addressing mode example:
//  value  EAX  MOV,
//  dest  EDI  MOV,
//  count  ECX  MOV,
//  REPNZUNTIL,  SCASD,  // compares 32 bit units at dest with value until
//                       //  a non match is found
//
// 64 bit addressing mode example:
//  value  EAX  MOV,
//  dest  RDI  MOV,
//  count  RCX  MOV,
//  REPNZUNTIL,  SCASD,  // compares 32 bit units at dest with value until
//                       //  a non match is found
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthscaswcomma ( SCASW, )
//
// C prototype:
//  void dg_forthscaswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode byte for the x86 SCASW instruction.
//  In 32 bit addressing mode:
//   Decrements ECX and compares the 16 bit value in [EDI] with AX.
//   EDI is adjusted according to the direction flag,
//    clear adds 2 and set subtracts 2.
//  In 64 bit addressing mode:
//   Decrements RCX and compares the 16 bit value in [RDI] with AX.
//   RDI is adjusted according to the direction flag,
//    clear adds 2 and set subtracts 2.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Note:
//   Some operating systems require the direction bit to
//   be left to forward (clear) when passing control to operating system
//   routines.
//   I usually push the flag register when entering and pop it when exiting
//   subroutines to cover this requirement.
//
// 32 bit addressing mode example:
//  value  AX  MOV,
//  dest  EDI  MOV,
//  count  ECX  MOV,
//  REPNZUNTIL,  SCASW,  // compares 16 bit units at dest with value until
//                       //  a non match is found
//
// 64 bit addressing mode example:
//  value  AX  MOV,
//  dest  RDI  MOV,
//  count  RCX  MOV,
//  REPNZUNTIL,  SCASW,  // compares 16 bit units at dest with value until
//                       //  a non match is found
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthscasqcomma ( SCASQ, )
//
// C prototype:
//  void dg_forthscasqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode byte for the x86 SCASQ instruction.
//  Not supported in 32 bit addressing mode.
//  In 64 bit addressing mode:
//   Decrements RCX and compares the 64 bit value in [RDI] with RAX.
//   RDI is adjusted according to the direction flag,
//    clear adds 8 and set subtracts 8.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Note:
//   Some operating systems require the direction bit to
//   be left to forward (clear) when passing control to operating system
//   routines.
//   I usually push the flag register when entering and pop it when exiting
//   subroutines to cover this requirement.
//
// 64 bit addressing mode example:
//  value  RAX  MOV,
//  dest  RDI  MOV,
//  count  RCX  MOV,
//  REPNZUNTIL,  SCASQ,  // compares 64 bit units at dest with value until
//                       //  a non match is found
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmwaitcomma ( MWAIT, )
//
// C prototype:
//  void dg_forthmwaitcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode string for the x86 MWAIT instruction.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  MWAIT,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpausecomma ( PAUSE, )
//
// C prototype:
//  void dg_forthpausecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode string for the x86 PAUSE instruction.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  PAUSE,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpopadcomma ( POPAD, )
//
// C prototype:
//  void dg_forthpopadcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  In 32 bit addressing mode:
//   Compiles the opcode string for the x86 POPAD instruction.
//   This instruction pops EDI, ESI, EBP, pop to void,
//     EBX, EDX, ECX, and EAX in that order
//  Not supported in 64 bit addressing mode.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  POPAD,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpopfdcomma ( POPF, POPFD, POPFQ, )
//
// C prototype:
//  void dg_forthpopfdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode string for the x86 POPFD POPFQ instruction.
//  In 32 bit addressing mode, this instruction pops the eflags register.
//  In 64 bit addressing mode, this instruction pops the rflags register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  POPFD,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpushadcomma ( PUSHAD, )
//
// C prototype:
//  void dg_forthpushadcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  In 32 bit addressing mode:
//   Compiles the opcode string for the x86 PUSHAD instruction.
//   This instruction pushes EAX, ECX, EDX, EBX, preinstruction ESP, EBP, ESI,
//    and EDI in that order.
//  Not supported in 64 bit addressing mode.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  PUSHAD,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpushfdcomma ( PUSHF, PUSHFD, PUSHFQ, )
//
// C prototype:
//  void dg_forthpushfdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode string for the x86 PUSHFD PUSHFQ instruction.
//   In 32 bit mode this instruction pushes the eflags register.
//   In 64 bit mode this instruction pushes the rflags register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  PUSHFD,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrsmcomma ( RSM, )
//
// C prototype:
//  void dg_forthrsmcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the other used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode string for the x86 RSM instruction. This opcode sequence
//   causes the processor to resume from system management mode.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  RSM,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthsysentercomma ( SYSENTER, )
//
// C prototype:
//  void dg_forthsysentercomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode string for the x86 SYSENTER instruction.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  SYSENTER,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthsysexitcomma ( SYSEXIT, )
//
// C prototype:
//  void dg_forthsysexitcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode string for the x86 SYSEXIT instruction.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  SYSEXIT,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthwaitcomma ( WAIT, )
//
// C prototype:
//  void dg_forthwaitcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode string for the x86 WAIT instruction.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  WAIT,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthiretdcomma ( IRETD, )
//
// C prototype:
//  void dg_forthiretdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode string for the x86 IRETD instruction.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  IRETD,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthiretqcomma ( IRETQ, )
//
// C prototype:
//  void dg_forthiretqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the other used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode string for the x86 IRETQ instruction.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  IRETQ,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthlahfcomma ( LAHF, EFLAGS->AH, )
//
// C prototype:
//  void dg_forthlahfcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the other used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode sequence for an x86 LAHF instruction. This instruction
//   copies the lower 8 bits of the flag register into the AH register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  EFLAGS->AH,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthsfencecomma ( SFENCE, )
//
// C prototype:
//  void dg_forthsfencecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode string for the x86 SFENCE instruction.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  SFENCE,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthxlatbcomma ( XLATB, [RBX+AL]->AL, )
//
// C prototype:
//  void dg_forthxlatbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//                                                              
// Execute state action:
//  Compiles the opcode string for the x86 XLATB instruction.
//  In 32 bit address mode, this compiles the opcode sequence for code
//   that the Intel docs say does: [DS:EBX + AL] -> AL
//  In 64 bit address mode, this compiles the opcode sequence for code
//   that does [RBX+AL]->AL
//
//  Note:
//   On the Mac OS X 64 bit operating system and any other operating system
//   that uses the System V ABI, RBX must be preserved through a subroutine call.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  XLATB,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthxchgcomma ( XCHG, )
//
// C prototype:
//  void dg_forthxchgcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist          
//  targetyparameterlist          
//
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   currentcompilebufferoffset [O]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of these parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A 
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//                                                              
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 XCHG instruction.
//   The memory target's value is exchanged with the register target's value.
//
// Note:
//  For exchanges, the behavior is the same regardless of which target is
//   the source and which is the destination; however if you want control over
//   how the instruction is encoded, then read on:
//  If you specify a register and a memory target, it doesn't matter
//   which one is target x or target y or what direction is used. 
//  If you specify two register targets, target x is used as the memory
//   target and target y is used as the register target.
//  If you specify two register targets along with <- then target y is used
//   as the memory target and target x is used as the register target.
// 
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  AL  CL  XCHG,        // exchanges value in AL with value in CL
//  EBX  EDX  XCHG,      // exchanges value in EBX with value in EDX
//  EDX [R]  EAX  XCHG,  // exchanges 32 bit value in memory at address
//                       //  specified by EDX with value in EAX
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some 
//   strange things if you are not careful.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcode ( CODE )
//
// C prototype:
//  void dg_forthcode (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( "<delimiters>name<delimiters>morestuff"
//    -currentinputbuffer- "<delimiters>morestuff" )
//  ( -- executiontoken codesys )
//
// Current input buffer in:
//  "<delimiters>name<delimiters>morestuff"  
//                                after removing leading delimiters, all
//                                 characters up to the next delimiter are
//                                 taken as the subroutine ('name') to be
//                                  processed
//
// Current input buffer out:      
//  "<delimiters>morestuff" the current offset pointer for the
//                                current input buffer is moved to the delimiter 
//                                after the subroutine ('name') or to the end
//                                of the buffer if no word is found
//
// Action:
//  gets the current input buffer, 
//  then gets the next word from the input buffer
//  then creates a new dictionary definition header with the word as the name
//   of the word; the dictionary definition header has a compile type of
//   compile call, and its data buffer id and offset
//   point to the end of the current compile buffer
//  then pushes the new word's execute token to the data stack
//  then pushes the codesys marker to the data stack
//  then sets the state of the script processor to execute
//  clears the 'safe call' flag.
//  then pushes the X86 assembler wordlist onto the end of the search order
//
// Note:
//  No code is compiled. This just assigns a label to the current offset in
//   the current compile buffer.
//   CODE END-CODE also checks to make sure the data stack is balanced when
//   END-CODE is done.
//   If you want to make subroutines with the standard Diaperglu frame, use
//   COMPILE-ENTER-FRAME at the beginning of your subroutines,
//   and COMPILE-EXIT-FRAME at each exit
//   from your subroutines.
//
//  The compile type is always compile, so if you use the subroutine name in
//   execute or compile mode the same thing happens, Diaperglu tries to compile
//   a call to the subroutine.
//  If the call originates from the same buffer as the function, a relative call
//   is compiled. If the call is from another buffer, an error is pushed. This
//   is a straight call. No parameters are pushed, nor is the stack aligned.
//   You are responsible for setting up the return stack  and/or registers
//   for the call.
//  If somehow you make a CODE routine with a data buffer of DG_CORE_BUFFERID,
//   a call to a fixed address is compiled. Note that the current calling
//   method trashes EAX in this case. Note that third party shared object
//   libraries and locked down buffers are loaded at fixed addresses, so you
//   could change the compile types of functions at these locations to OCRCALL
//   and use them in the same way as you would use a CODE routine from
//   this assembler.
//
//  Since subroutine names created with code are compiling words... it might be
//   a good idea to name the subroutines like this:
//    myfunctionnameCALL,
//
//  Diaperglu 3.0:
//  Calls to other buffers except DG_CORE_BUFFERID are not supported at this time
//   because that would require Diaperglu to be aware of several things:
//   1) Can the target buffer move? If the target buffer's growby and maxsize
//    are the same then a call to a fixed address can be compiled. If not, then
//    the run time code needs a way to find the buffer's current address.
//   2) Is the currently compiling subroutine using the standard Diaperglu frame?
//     If it is, then pBHarrayhead is available and the offset and bufferid of
//     the other buffer can be found at run time.
//   Functionality to check for these cases and automatically compile calls when
//    possible will likely be added in future versions of Diaperglu.
//
//
//  32 bit addressing mode examples:
//
//   CODE bumpeax,  EAX INC,  RET,  END-CODE
//
//   CODE dosomethingwitheax,
//    1234 N  EAX  MOV,
//    bumpeax,                  // compiles a relative call to bumpeax,
//    RET,
//   END-CODE
//
//
//  64 bit addressing mode examples:
//
//   CODE bumprax,  RAX INC,  RET,  END-CODE
//
//   CODE dosomethingwithrax,
//    1234 N  RAX  MOV,
//    bumprax,                  // compiles a relative call to bumprax,
//    RET,
//   END-CODE
//
//
//  How to use CODE to compile a Diaperglu script word:
//
//   CODE DUPDUPDUP
//    COMPILE-ENTER-FRAME  // Set up the standard Diaperglu subroutine frame.
//                         //  subroutine now has access to pBHarrayhead and
//                         //  can now call Diaperglu script functions
//
//     ] DUP DUP DUP [     // drop into compile mode and compile calls to
//                         //  script functions
//
//    COMPILE-EXIT-FRAME   // required to undo stuff from COMPILE-ENTER-FRAME
//                        //  also compiles RET, for you
//
//   END-CODE
//
//   OCREXECUTE ' DUPDUPDUP SETOCR  // change compile type to that of a regular
//                                  //  Diaperglu script command. Now you can
//                                  //  use this function name as a Diaperglu
//                                  //  script command.
//
//
//  Passing parameters for 32 bit addressing mode:
//
//   If you use COMPILE-ENTER-FRAME then
//    parameter 0 is at:
//     EBP 08 [R+N]
//    parameter 1 is at (in hex):
//     EBP 0C [R+N]
// 
//   If you do not use COMPILE-ENTER-FRAME then at the entry of your subroutine
//    parameter 0 is at:
//     ESP 4 [R+N]
//    parameter 1 is at:
//     ESP 8 [R+N]
//
//
// Forth standard:
//  
//                                                           
// NOTE: with the name parsing algorithm this routine uses, the name has to be
//  in the same input buffer as the CODE
//  so if you are entering lines from the console, it has to be on the same
//  line as CODE
//  but if you are loading from a file, it can be on a later line in the file
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthocode ( OCODE )
//
// C prototype:
//  void dg_forthocode (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( "<delimiters>name<delimiters>morestuff"
//    -currentinputbuffer- "<delimiters>morestuff" )
//  ( -- executiontoken codesys )
//
// Current input buffer in:
//  "<delimiters>name<delimiters>morestuff"  
//                                after removing leading delimiters, all
//                                 characters up to the next delimiter are
//                                 taken as the subroutine ('name') to be
//                                  processed
//
// Current input buffer out:      
//  "<delimiters>morestuff" the current offset pointer for the
//                                current input buffer is moved to the delimiter 
//                                after the subroutine ('name') or to the end
//                                of the buffer if no word is found
//
// Action:
//  gets the current input buffer, 
//  then gets the next word from the input buffer
//  then creates a new dictionary definition header with the word as the name
//   of the word. The dictionary definition header has a compile type of
//   pushn where n is the current offset in the current compile buffer
//  then pushes the new word's execute token to the data stack
//  then pushes the codesys marker to the data stack
//  then sets the state of the script processor to execute
//  clears the 'safe call' flag.
//  then pushes the X86 assembler wordlist onto the end of the search order
//
// Note:
//  No code is compiled. This just assigns a label to the current offset in
//   the current compile buffer.
//   OCODE END-CODE also checks to make sure the data stack is balanced when
//   END-CODE is done.
//   If you want to make subroutines with the standard Diaperglu frame, use
//   ENTER, at the beginning of your subroutines, and EXIT, at each exit
//   from your subroutines.
//
//  The default safe behavior errs on the side of caution. Only
//   compiling words and words that could be used to push to the current
//   compile buffer need to be marked as safe. Safe subroutine calls
//   execute slower but can handle the case where the buffer they are called
//   from relocates while they are executing.
//
//  The compile type is always compile, so if you use the subroutine name in
//   execute or compile mode the same thing happens, Diaperglu tries to compile
//   a call to the subroutine.
//  If the call originates from the same buffer as the function, a relative call
//   is compiled. If the call is from another buffer, an error is pushed. This
//   is a straight call. No parameters are pushed, nor is the stack aligned.
//   You are responsible for setting up the return stack  and/or registers
//   for the call.
//  If somehow you make a CODE routine with a data buffer of DG_CORE_BUFFERID,
//   a call to a fixed address is compiled. Note that the current calling
//   method trashes EAX in this case. Note that third party shared object
//   libraries and locked down buffers are loaded at fixed addresses, so you
//   could change the compile types of functions at these locations to OCRCALL
//   and use them in the same way as you would use a CODE routine from
//   this assembler.
//
//  Diaperglu 3.0:
//  Calls to other buffers except DG_CORE_BUFFERID are not supported at this time
//   because that would require Diaperglu to be aware of several things:
//   1) Can the target buffer move? If the target buffer's growby and maxsize
//    are the same then a call to a fixed address can be compiled. If not, then
//    the run time code needs a way to find the buffer's current address.
//   2) Is the currently compiling subroutine using the standard Diaperglu frame?
//     If it is, then pBHarrayhead is available and the offset and bufferid of
//     the other buffer can be found at run time.
//   Functionality to check for these cases and automatically compile calls when
//    possible will likely be added in future versions of Diaperglu.
//  
//                                                           
// NOTE: with the name parsing algorithm this routine uses, the name has to be
//  in the same input buffer as the OCODE
//  so if you are entering lines from the console, it has to be on the same
//  line as OCODE
//  but if you are loading from a file, it can be on a later line in the file
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthendcode ( END-CODE )
//
// C prototype:
//  void dg_forthendcode (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( executiontoken codesys -- )
//                                                             
// Action:
//  sets the script processing state to execute
//  checks to make sure the codesys from the CODE or OCODE is there
//  links the executiontoken's word into the current new word wordlist
//  if a safe call to a subroutine was compiled since CODE, the last
//   definition is marked as safe also, which is probably not what you
//   want the word to do... unless you just made a forth subroutine.
//
// Forth standard:
//  
//
//  Note: The default safe behavior errs on the side of caution. Only
//   compiling words and words that could be used to push to the current
//   compile buffer need to be marked as safe. Safe subroutine calls
//   execute slower but can handle the case where the buffer they are called
//   from relocates while they are executing.
//
// Failure cases:
//  error compiling return
//  error setting script processing state to execute
//  error getting pointer to the data stack
//  data stack underflow
//  data stack not balanced from CODE or OCODE, probably due to an unresolved branch
//  error getting current compile word list id
//  error linking definition to the current compile word list
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthbegincomma ( BEGIN, )
//
// C prototype:
//  void dg_forthbegincomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- beginoffset )
//                                                             
// Action:
//  Pushes the current compile buffer offset onto the data stack.
//
// Note:
//  This word is used to mark the beginning of a loop.
//
// Examples:
//  BEGIN,
//   loop body
//  CS UNTIL,       // loops until the carry flag is set
//
//  BEGIN,
//   loop body a
//  ZS WHILE,       // exits the loop if the zero flag is NOT set
//   loop body b
//  REPEAT,
//
//  BEGIN,
//   loop body
//  LOOP,           // loops until the loop counter register is 0
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthuntilcomma ( UNTIL, )
//
// C prototype:
//  void dg_forthuntilcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( beginoffset conditioncode -- )
//
// Data stack in:
//  beginoffset                   offset in current compile buffer from when BEGIN,
//                                 was executed.
//
//  conditioncode                 0-15, x86 code for conditional instructions
//                                 is one of:
//                                  VS   for overflow set
//                                  NV   for overflow clear or no overflow
//                                  CS   for carry set
//                                  NC   for carry clear or no carry
//                                  ULT  for unsigned less than
//                                  ULE  for unsigned less than or equal
//                                  UGT  for unsigned greater than
//                                  UGE  for unsigned greater than or equal
//                                  ZS   for zero set
//                                  NZ   for zero clear or no zero
//                                  EQ   for equal or zero set
//                                  NE   for not equal or zero clear
//                                  SS   for sign set
//                                  NS   for no sign or sign clear
//                                  MI   for minus or sign set
//                                  PL   for plus or sign clear
//                                  PS   for parity set
//                                  NP   for no parity or parity clear
//                                  LT   for signed less than
//                                  GE   for signed greater than or equal
//                                  LE   for signed less than or equal
//                                  GT   for signed greater than
//                                  ALWAYS
//                                  NEVER
//
// Action:
//  Compiles a resolved branch back to the beginoffset for the opposite condition
//   of the condition code. When executed the branch is taken when the condition
//   is NOT true.
//
// Examples:
//  BEGIN,
//   loop body
//  CS UNTIL,       // loops until the carry flag is set
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_compileloopwhilecomma
//
// C prototype:
//  void dg_compileloopwhilecomma (
//   Bufferhandle* pBHarrayhead,
//   UINT64 baseopcode)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
//  UINT64 baseopcode             base opcode for the loop instruction.
//                                                             
// Action:
//  Compiles an unresolved loop branch. Then compiles an unresolved branch
//   always forward. Then resolves the loop branch to go to the current compile
//   buffer location. Then pushes the current compile buffer location to the
//   data stack and swaps the begin offset that was already there with the
//   new while offset.
//
// Note:
//  Branch range is limited to a signed 32 bit integer.
//
// 64 bit address mode example:
//  BEGIN,
//   loop body part a
//  LOOPNOTDONEWHILE,    // decrements RCX and stays in repeating code
//                       //  until RCX is 0
//   loop body part b
//  REPEAT,
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthloopwhilecomma ( LOOPNOTDONEWHILE, )
//
// C prototype:
//  void dg_forthloopwhilecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( beginoffset -- whileoffset beginoffset )
//                                                             
// Action:
//  Compiles an unresolved loop branch. Then compiles an unresolved branch
//   always forward. Then resolves the loop branch to go to the current compile
//   buffer location. Then pushes the current compile buffer location to the
//   data stack and swaps the begin offset that was already there with the
//   new while offset.
//  In 64 bit address mode when executed:
//   the RCX register is decremented and the loop branch
//   forward into the main part of the BEGIN, LOOPNOTDONEWHILE, REPEAT, loop is
//   taken until RCX is 0. If RCX is 0, the branch always is taken to the
//   code after the REPEAT,
//  32 bit address mode uses the ECX register.
//
// Note:
//  Branch range is limited to a signed 32 bit integer.
//
// 64 bit address mode example:
//  BEGIN,
//   loop body part a
//  LOOPNOTDONEWHILE,    // decrements RCX and stays in repeating code
//                       //  until RCX is 0
//   loop body part b
//  REPEAT,
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthzsorloopwhilecomma ( ZSORLOOPNOTDONEWHILE, EQORLOOPNOTDONEWHILE, )
//
// C prototype:
//  void dg_forthzsorloopwhilecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( beginoffset -- whileoffset beginoffset )
//                                                             
// Action:
//  Compiles an unresolved loop while zero branch. Then compiles an unresolved
//   branch always forward. Then resolves the loop while zero branch to go to
//   the current compile buffer location. Then pushes the current compile buffer 
//   location to the data stack and swaps the begin offset that was already there 
//   with the new while offset.
//  In 64 bit address mode when executed:
//   the RCX register is decremented and the loop branch
//   forward into the main part of the BEGIN, ZSORLOOPNOTDONEWHILE, REPEAT, loop
//    is taken until RCX is 0 or the zero flag is clear. Note that the decrement
//    of RCX does not alter the zero flag. If RCX is 0 or the zero flag is clear,
//   the branch always is taken to the code after the REPEAT,
//
// Note:
//  Branch range is limited to a signed 32 bit integer.
//
// 64 bit address mode example:
//  BEGIN,
//   loop body part a
//  ZSORLOOPNOTDONEWHILE,    // decrements RCX and stays in repeating code
//                           //  until zero flag is clear or RCX is 0
//   loop body part b
//  REPEAT,
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthloopcomma ( LOOPDONEUNTIL, )
//
// C prototype:
//  void dg_forthloopcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( beginoffset -- )
//                                                             
// Action:
//  Compiles a resolved loop branch back to the beginoffset.
//  In 32 bit addressing mode when executed:
//   the ECX register is decremented and if ECX is not 0 then a branch back to
//   the BEGIN, is taken.
//  In 64 bit addressing mode when executed:
//   the RCX register is decremented and if RCX is not 0 then a branch back to
//   the BEGIN, is taken.
//
// Note:
//  The branch range is limited to a signed 32 bit integer.
//
// Examples:
//  BEGIN,
//   loop body
//  LOOPDONEUNTIL,          // loops until the loop counter register is 0
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthnzorloopwhilecomma ( NZORLOOPNOTDONEWHILE, NEORLOOPNOTDONEWHILE, )
//
// C prototype:
//  void dg_forthnzorloopwhilecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( beginoffset -- whileoffset beginoffset )
//                                                             
// Action:
//  Compiles an unresolved loop while not zero branch. Then compiles an unresolved
//   branch always forward. Then resolves the loop while not zero branch to go to
//   the current compile buffer location. Then pushes the current compile buffer 
//   location to the data stack and swaps the begin offset that was already there 
//   with the new while offset.
//  In 64 bit address mode when executed,
//   the RCX register is decremented and the loop branch
//   forward into the main part of the BEGIN, NZORLOOPNOTDONEWHILE, REPEAT, loop
//   is taken until RCX is 0 or the zero flag is set. Note that the decrement of
//   RCX does not alter the zero flag. If RCX is 0 or the zero flag is set, the
//   branch always is taken to the code after the REPEAT,
//
// Note:
//  The branch range is limited to a signed 32 bit integer.
//
// 64 bit address mode example:
//  BEGIN,
//   loop body part a
//  NZORLOOPNOTDONEWHILE,    // decrements RCX and stays in repeating code
//                           //  until zero flag is set or RCX is 0
//   loop body part b
//  REPEAT,
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthloopnzcomma ( ZSORLOOPDONEUNTIL, EQORLOOPDONEUNTIL, )
//
// C prototype:
//  void dg_forthloopnzcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( beginoffset -- )
//                                                             
// Action:
//  Compiles a resolved loopnz branch back to the beginoffset. When executed,
//   the RCX register is decremented and if RCX is not zero and the zero flag
//   is not set then a branch back to the BEGIN, is taken. Decrementing RCX
//   does not affect the zero flag.
//
// Note:
//  The branch range is limited to a signed 32 bit integer.
//
// Examples:
//  BEGIN,
//   loop body
//  ZSORLOOPDONEUNTIL, // loops back to BEGIN, until the zero
//                     //  flag is set or RCX is decremented to 0
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthloopzscomma ( NZORLOOPDONEUNTIL, NEORLOOPDONEUNTIL, )
//
// C prototype:
//  void dg_forthloopzscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( beginoffset -- )
//
// Action:
//  Compiles a resolved loopz branch back to the beginoffset. When executed,
//   the RCX register is decremented and if RCX is not zero and the zero flag
//   is set then a branch back to the BEGIN, is taken. Decrementing RCX
//   does not affect the zero flag.
//
// Note:
//  The branch range is limited to a signed 32 bit integer.
//
// Examples:
//  BEGIN,
//   loop body
//  NZORLOOPDONEUNTIL, // loops until the zero flag is clear or
//                     //  RCX is decremented to 0
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthifcomma ( IF, )
//
// C prototype:
//  void dg_forthifcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( conditioncode -- ifoffset )
//
// Data stack in:
//  conditioncode                 0-15, x86 code for conditional instructions
//                                 is one of:
//                                  VS   for overflow set
//                                  NV   for overflow clear or no overflow
//                                  CS   for carry set
//                                  NC   for carry clear or no carry
//                                  ULT  for unsigned less than
//                                  ULE  for unsigned less than or equal
//                                  UGT  for unsigned greater than
//                                  UGE  for unsigned greater than or equal
//                                  ZS   for zero set
//                                  NZ   for zero clear or no zero
//                                  EQ   for equal or zero set
//                                  NE   for not equal or zero clear
//                                  SS   for sign set
//                                  NS   for no sign or sign clear
//                                  MI   for minus or sign set
//                                  PL   for plus or sign clear
//                                  PS   for parity set
//                                  NP   for no parity or parity clear
//                                  LT   for signed less than
//                                  GE   for signed greater than or equal
//                                  LE   for signed less than or equal
//                                  GT   for signed greater than
//                                  ALWAYS
//                                  NEVER
//
// Data stack out:
//  ifoffset                      next unused byte in current compile buffer
//                                 which is used by THEN, to resolve the branch
//                     
// Action: 
//  Compiles code that takes an unresolved branch on the opposite condition
//   from the condition code.
//
// Forth standard:
//
// Examples:
//  CS IF,
//   code to do if carry flag was set
//  THEN,
//
//  MI IF,
//   code to do if sign flag was set
//  ELSE,
//   code to do if sign flag was clear
//  THEN,
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthelsecomma ( ELSE, )
//
// C prototype:
//  void dg_forthelsecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( ifoffset -- elseoffset )
// 
// Data stack out:
//  elseoffset                    next unused byte in current compile buffer which
//                                 is used by THEN, to resolve the branch
//                     
// Action: 
//  Compiles an unresolved branch always, then resolves the branch from the IF,
//
// Forth standard:
//
// Example:
//  MI IF,
//   code to do if sign flag was set
//  ELSE,
//   code to do if sign flag was clear
//  THEN,
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forththencomma ( THEN, )
//
// C prototype:
//  void dg_forththencomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( elseoffset|ifoffset -- )
//                     
// Action: 
//  Resolves the branch from the IF, or ELSE,
//
// Forth standard:
//
// Example:
//  MI IF,
//   code to do if sign flag was set
//  ELSE,
//   code to do if sign flag was clear
//  THEN,
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthwhilecomma ( WHILE, )
//
// C prototype:
//  void dg_forthwhilecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the one used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( beginoffset conditioncode -- whileoffset beginoffset )
//
// Data stack in:
//  beginoffset                   offset in current compile buffer from when BEGIN,
//                                 was executed.
//
//  conditioncode                 0-15, x86 code for conditional instructions
//                                 is one of:
//                                  VS   for overflow set
//                                  NV   for overflow clear or no overflow
//                                  CS   for carry set
//                                  NC   for carry clear or no carry
//                                  ULT  for unsigned less than
//                                  ULE  for unsigned less than or equal
//                                  UGT  for unsigned greater than
//                                  UGE  for unsigned greater than or equal
//                                  ZS   for zero set
//                                  NZ   for zero clear or no zero
//                                  EQ   for equal or zero set
//                                  NE   for not equal or zero clear
//                                  SS   for sign set
//                                  NS   for no sign or sign clear
//                                  MI   for minus or sign set
//                                  PL   for plus or sign clear
//                                  PS   for parity set
//                                  NP   for no parity or parity clear
//                                  LT   for signed less than
//                                  GE   for signed greater than or equal
//                                  LE   for signed less than or equal
//                                  GT   for signed greater than
//                                  ALWAYS
//                                  NEVER
//
// Action: 
//  Compiles code that takes an unresolved branch on the opposite condition from
//    the condition code. Then does a SWAP to exchange the beginoffset with the
//    whileoffset.
//
// Forth standard:
//
// Example:
//  BEGIN,
//   loop body part a
//  CS WHILE,           // continue doing loop while carry is set
//   loop body part b
//  REPEAT,
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrepeatcomma ( REPEAT, )
//
// C prototype:
//  void dg_forthrepeatcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( whileoffset beginoffset -- )
//                     
// Action: 
//  Compiles a resolved branch back to the BEGIN, begin offset,
//  then resolves the branch at the WHILE, whileoffset.
//
// Forth standard:
//
// Example:
//  BEGIN,
//   loop body part a
//  CS WHILE,           // continue doing loop while carry is set
//   loop body part b
//  REPEAT,
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthzeroimmediate ( N )
//
// C prototype:
//  void dg_forthzeroimmediate (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- 0 IMMEDIATE )
//                     
// Action: 
//  Is short for 0 IMMEDIATE. This specifies an immediate target which is to be
//   encoded using the smallest possible immediate size.
//
// Note:
//  In 64 bit addressing mode, except for the MOV, instruction when the other
//   target is a register, the immediate value must fit into 32 bits. Whether
//   or not the value is treated as signed or unsigned depends on the instruction,
//   but most are treated as signed.
//
// Example:
//  3276 N
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_isimmediate ( IMMEDIATE )
//
// C prototype:
//  None. It's a #defined constant in the diapergluforth.h file
//
// Stack action shorthand:
//  ( -- dg_isimmediate )
//
// Action:
//  Pushes the constant representing IMMEDIATE to the data stack. This constant
//   specifies an immediate target with a specified minimum byte size for
//   encoding the immediate value.
//
// Example:
//  4 IMMEDIATE  // specifies an immediate target and to use at least 4 bytes
//               //  to encode the immediate value
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthbracketr ( [R] )
//
// C prototype:
//  void dg_forthbracketr (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- 0 0 [MOD] )
//                     
// Action: 
//  Specifies a memory target at the address held in the register.
//
// Example:
//  ECX [R]
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthbracketrplusd ( [R+N] )
//
// C prototype:
//  void dg_forthbracketrplusd (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- 0 [MOD] )
//                     
// Action: 
//  Specifies a memory target at address of register plus signed displacement
//   that uses the smallest possible displacement size. The displacement must
//   fit into a signed 32 bit integer.
//
// Example:
//  RCX 8 [R+N]
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_isbasedisplacement ( [MOD] )
//
// C prototype:
//  None. It's a #defined constant in the diapergluforth.h file
//
// Stack action shorthand:
//  ( -- [MOD] )
//
// Action:
//  Pushes the constant representing [MOD] to the data stack. This constant
//   specifies a memory target at address of register plus signed displacement
//   that uses the minimum displacement size specified. The displacement must
//   fit into a signed 32 bit integer.
//
// Example:
//  RCX 8 4 [MOD]   // specifies a memory target at address RCX + 8 and to encode
//   the displacement using at least 4 bytes even if a smaller displacement size is
//   available for this instruction.
//
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthbracketd ( [N] )
//
// C prototype:
//  void dg_forthbracketd (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( absoluteaddress -- NOREG absoluteaddress 0 [MOD] )
//                     
// Action: 
//  Specifies a memory target at an absolute address.
//
// Note:
//  The address is always encoded using 32 bits. This means this is probably not
//   all that useful in 64 bit addressing mode. If you do use this in 64 bit
//   addressing mode, absoluteaddress is a signed 32 bit integer. Maybe if you
//   do that thing Mac OS X does where all your segments start at address 0 and
//   if you limit the segment size to what will fit into the largest signed 32
//   bit integer...
//
// Example:
//  C40038 [N]
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthbracketrpsxrpd ( [R+S*R+N] )
//
// C prototype:
//  void dg_forthbracketrpsxrpd (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- 0 [SIB] )
//                     
// Action: 
//  Specifies a memory target at basereg plus scale times indexreg plus
//   displacement. The displacement is encoded using the smallest size 
//   displacement possible. The displacement must fit into a signed 32 bit
//   integer.
//
// Example:
//  ECX SCALE4* EBX 8 [R+S*R+N]
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_isbasescaleindexdisplacement ( [SIB] )
//
// C prototype:
//  None. It's a #defined constant in the diapergluforth.h file
//
// Stack action shorthand:
//  ( -- [SIB] )
//
// Action:
//  Pushes the constant representing [SIB] to the data stack. This constant
//   specifies a memory target at basereg plus scale times indexreg plus
//   displacement. The displacement is encoded using at least the displacement
//   size specified. The displacement must fit into a signed 32 bit
//   integer.
//
// Example:
//  RCX SCALE4* RBX 8 4 [SIB]  // specifies an address target at RCX + 4*RBX + 8
//                             //  and to encode the displacement using at least
//                             //  4 bytes even if there is a smaller displacement
//                             //  size available for this instruction
//
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_isccbufferoffsetnobracket ( O )
//
// C prototype:
//  None. It's a #defined constant in the diapergluforth.h file
//
// Stack action shorthand:
//  ( mybufferoffset -- mybufferoffset dg_isccbufferoffsetnobracket )
//                     
// Action: 
//  None. It's a constant. When used this pushes a constant onto the data stack
//  that specifies an offset in the current compile buffer.
//
// Usage:
//  mybufferoffset O CALL,  ( this becomes nn RIP+N CALL, )
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_iscurrentcompilebufferoffset ( [O] )
//
// C prototype:
//  None. It's a #defined constant in the diapergluforth.h file
//
// Stack action shorthand:
//  ( mybufferoffset -- mybufferoffset dg_iscurrentcompilebufferoffset )
//                     
// Action: 
//  None. It's a constant. When used this pushes a constant onto the data stack
//   that specifies the contents of memory at an offset in the current compile 
//   buffer.
//
// Usage:
//  mybufferoffset [O] CALL,  ( this becomes nn [RIP+N] CALL, )
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthimp ( IMP )
//
// C prototype:
//  void dg_forthimp (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  Mac: ( -- RIP 0 4 [MOD] )
//  Win: ( -- RIP 5 4 [MOD] )
//                     
// Action: 
//  Specifies a memory target at address of register plus signed displacement
//   that uses a 32 bit displacement. The displacement must fit into a signed 
//   32 bit integer. This addressing mode is used to help make platform 
//   independent import links.
//
// Usage:
//  IMP CALL,  OIMPORTCODELINK myimportsymbolname
//
// Note:  
//  (2022 April 8)
//  On Mac, the linker adds to the displacement during compile time linking, 
//   and the displacement is the import link. The Mac linker generates an import 
//   link table that is shared among all the .o files being linked. At run time,
//   the loader fills the link table with the true addresses of the imported
//   functions. This way, each symbol only needs to be linked once at run time.
//   For this reason, the displacement of the link has to be 0, and IMP will
//   work with any kind of instruction, not just a call.
//   
//  On Windows, they have different kinds of linking you can specify but at this
//   time I only know how to do the function one. The BUF>NEWEXPORTIMPORT.OBUF
//   function only supports function call imports at this time. 
//  Windows generates a jump table, and then puts the 64 bit address of the jump 
//   instruction into the offset you specify... so what I did was make 
//   OIMPORTCODELINK on Windows compile a branch over the import link. The offset 
//   of 5 is to get to the import link after the 5 byte branch always instruction 
//   used to skip the link.
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_isparamusingframe ( FRAME-PARAM )
//
// C prototype:
//  None. It's a #defined constant in the diapergluforth.h file
//
// Stack action shorthand:
//  ( parameterindex -- parameterindex dg_isparamusingframe )
//                     
// Action: 
//  None. It's a constant. When used, this pushes a constant onto the data stack
//   that specifies an addressing mode of a parameter passed into the current 
//   subroutine. It also assumes the subroutine is using the RBP frame register 
//   and that the subroutine does RBP PUSH, RSP RBP MOV, at the beginning of the 
//   subroutine. The parameter index gets converted into an xmm register,
//   64 bit integer register, or RBP nn [R+N] where nn is the offset to the
//   parameter. FRAME-PARAM assumes some variables have been set. These
//   variables are set automatically when you use FRAME-PARAMS,< 
//
//  On both Mac and Windows, the variables are:
//   PNUMBEROFINTPARAMS
//   PNUMBEROFFLOATPARAMS
//
//  Mac also uses a variable to say what type extra parameters are, which
//   does not have a Forth word to access it at this time. (2022 Apr 9 J.N.)
//
//  Windows also uses 4 variables to say what type each of the first four
//   parameters are. (0 is int, 1 is float). These variables do not have
//   Forth words to access them at this time. (2022 Apr 9 J.N.)
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_isparamusingnoframe ( NO-FRAME-PARAM )
//
// C prototype:
//  None. It's a #defined constant in the diapergluforth.h file
//
// Stack action shorthand:
//  ( parameterindex -- parameterindex dg_isparamusingnoframe )
//                     
// Action: 
//  None. It's a constant. When used this pushes a constant onto the data stack
//   that specifies an addressing mode of a parameter passed into the current 
//   subroutine. This addressing mode converts the parameter index into an
//   xmm register, 64 bit integer register, or RSP nn [R+N] where nn is the
//   offset to the parameter on the return stack. NO-FRAME-PARAM assumes
//   some variables have been set. These variables are set automatically when
//   you use NO-FRAME-PARAMS<
//  NO-FRAME-PARAM also needs PRSDEPTH set to the number of UINT64 values
//   on the return stack. NO-FRAME-PARAMS< sets this to the operating
//   system dependent default number of items on the return stack at the entry
//   of a function. (Mac is 1, Windows is 5). You need to add 1 to PRSDEPTH
//   for each 64 bit value you pushed onto the return stack in your subroutine
//   at the point you use the parameter in your subroutine in order for
//   NO-FRAME-PARAM to work correctly.
//
//  On both Mac and Windows, the variables are:
//   PNUMBEROFINTPARAMS
//   PNUMBEROFFLOATPARAMS
//   PRSDEPTH
//
//  Mac also uses a variable to say what type extra parameters are, which
//   does not have a Forth word to access it at this time. (2022 Apr 9 J.N.)
//
//  Windows also uses 4 variables to say what type each of the first four
//   parameters are. (0 is int, 1 is float). These variables do not have
//   Forth words to access them at this time. (2022 Apr 9 J.N.)
//  
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthdatasizebyte ( 8BIT )
//
// C prototype:
//  void dg_forthdatasizebyte (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- 1 DATASIZE )
//                     
// Action: 
//  Specifies a data size of 8 bits.
//
// Examples:
//  ECX [R]  8BIT  INC,  // increments the 8 bit value in the memory at the
//                       //  address in ECX
//
//  34 N  ECX [R]  8BIT  MOV,  // moves the 8 bit value 34 into the memory at 
//                             //  the address in ECX. ( Only the lower 8 bits
//                             //  of the immediate target are used. )
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthdatasizeword ( 16BIT )
//
// C prototype:
//  void dg_forthdatasizeword (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- 2 DATASIZE )
//                     
// Action: 
//  Specifies a data size of 16 bits.
//
// Examples:
//  ECX [R]  16BIT  INC,  // increments the 16 bit value in the memory at the
//                        //  address in ECX
//
//  HEX  1234 N  ECX [R]  16BIT  MOV,  // moves the 16 bit value 1234 into the
//                                     //  memory at the address in ECX.
//                                     // ( Only the lower 16 bits of the
//                                     //  immediate target are used. )
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthdatasizedword ( 32BIT )
//
// C prototype:
//  void dg_forthdatasizedword (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- 4 DATASIZE )
//                     
// Action: 
//  Specifies a data size of 32 bits.
//
// Examples:
//  ECX [R]  32BIT  INC,  // increments the 32 bit value in memory at the
//                        //  address in ECX
//
//  HEX  87651234 N  ECX [R]  32BIT  MOV,  // moves the 32 bit value 87651234
//                                         // into memory at the address
//                                         // in ECX.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthdatasizeqword ( 64BIT )
//
// C prototype:
//  void dg_forthdatasizeqword (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- 8 DATASIZE )
//                     
// Action: 
//  Specifies a data size of 64 bits.
//
// Examples:
//  RCX [R]  64BIT  INC,  // increments the 64 bit value in memory at the
//                        //  address in RCX
//
//  HEX  1122334487651234 N  RCX [R]  64BIT  MOV,  // moves the 64 bit value
//                                         // 1122334487651234 into memory at the
//                                         // address in RCX.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthdatasize80bit ( 80BIT )
//
// C prototype:
//  void dg_forthdatasize80bit (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 the other used as the bufferhandle for the
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- 10 DATASIZE )
//
// Action:
//  Specifies a data size of 80 bits. Some floating point instructions use
//   80 bit data sizes.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_isdatasize ( DATASIZE )
//
// C prototype:
//  None. It's a #defined constant in the diapergluforth.h file
//
// Stack action shorthand:
//  ( -- dg_isdatasize )
//
// Action:
//  Pushes the constant representing DATASIZE to the data stack. This constant
//   specifies a data size for a target, usually a memory target.
//
// Example:
//  RAX [R] 8 DATASIZE  // specifies a memory target at the address in RAX with
//                      //  a data size of 8 bytes.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_isforward ( -> )
//
// C prototype:
//  None. It's a #defined constant in the diapergluforth.h file
//
// Stack action shorthand:
//  ( -- dg_isforward )
//
// Action:
//  Pushes the constant representing -> to the data stack. This constant
//   specifies a forward direction for a target and is ignored because forward
//   is the default direction.
//
// Example:
//  RAX  RDX ->  // does nothing. RAX is source, RDX is destination.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_isreverse ( <- )
//
// C prototype:
//  None. It's a #defined constant in the diapergluforth.h file
//
// Stack action shorthand:
//  ( -- dg_isreverse )
//
// Action:
//  Pushes the constant representing <- to the data stack. This constant
//   specifies a reverse direction for a target.
//
// Example:
//  RAX  RDX <-  // makes RDX reverse. For most two target instructions, if
//                  //  either target is reverse, then the first target is the
//                  //  destination and the second target is the source.
//
//  RAX <- RDX   // another way to do it
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_isreg ( R )
//
// C prototype:
//  None. It's a #defined constant in the diapergluforth.h file
//
// Stack action shorthand:
//  ( -- dg_isreg )
//
// Action:
//  Pushes the constant representing R to the data stack. This constant
//   specifies a register target.
//
// Example:
//  RAX R  // specifies RAX as a target. R is optional.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_isfloatingpointstackreg ( FPSR )
//
// C prototype:
//  None. It's a #defined constant in the diapergluforth.h file
//
// Stack action shorthand:
//  ( -- dg_isfloatingpointstackreg )
//
// Action:
//  Pushes the constant representing FPSR to the data stack. This constant
//   specifies a floating point register target.
//
// Example:
//  ST0 FPSR  // specifies ST0 as a target. FPSR is optional.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_isxmmreg ( XMMR )
//
// C prototype:
//  None. It's a #defined constant in the diapergluforth.h file
//
// Stack action shorthand:
//  ( -- dg_isxmmreg )
//
// Action:
//  Pushes the constant representing XMMR to the data stack. This constant
//   specifies an xmm register target.
//
// Example:
//  XMM0 XMMR  // specifies XMM0 as a target. XMMR is optional.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_iscontrolreg ( CR )
//
// C prototype:
//  None. It's a #defined constant in the diapergluforth.h file
//
// Stack action shorthand:
//  ( -- dg_iscontrolreg )
//
// Action:
//  Pushes the constant representing CR to the data stack. This constant
//   specifies a control register target. I think only the MOVCR instruction
//   supports control register targets.
//
// Example:
//  CR0 CR  // specifies CR0 as a target. CR is optional.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_isdebugreg ( DR )
//
// C prototype:
//  None. It's a #defined constant in the diapergluforth.h file
//
// Stack action shorthand:
//  ( -- dg_isdebugreg )
//
// Action:
//  Pushes the constant representing DR to the data stack. This constant
//   specifies a debug register target. I think only the MOVDR instruction
//   supports debug register targets.
//
// Example:
//  DR0 DR  // specifies DR0 as a target. DR is optional.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_issegmentreg ( SR )
//
// C prototype:
//  None. It's a #defined constant in the diapergluforth.h file
//
// Stack action shorthand:
//  ( -- dg_issegmentreg )
//
// Action:
//  Pushes the constant representing SR to the data stack. This constant
//   specifies a segment register target. I think only the MOVSR instruction
//   supports segment register targets. There are also some other instructions
//   where the segment register target is built into the instruction name.
//
// Example:
//  SRCS SR  // specifies SRCS as a target. SR is optional.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthx86wordlist ( X86-WORDLIST )
//
// C prototype:
//  void dg_forthx86wordlist (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- x86wordlistid )
//                     
// Action: 
//  Pushes the word list id of the x86 assembler wordlist onto the data stack.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_compilebitoprofr
//
// C prototype:
//  void dg_compilebitoprofr (
//    Bufferhandle* pBHarrayhead,
//    struct dg_Sibformatter* psourceregtarget,
//    struct dg_Sibformatter* pdestregtarget,
//    UINT64 baseopcode)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
//  struct dg_Sibformatter* psourceregtarget
//                                pointer to SIB formatter structure for the
//                                 source target. The source target holds the
//                                 count for the bit operation.
//
//  struct dg_Sibformatter* pdestregtarget
//                                pointer to SIB formatter structure for the
//                                 destination target. The destination target
//                                 holds the index of the start of the bit array
//                                 for the bit operation.
//
//  UINT64 baseopcode             base opcode for the bit operation
//
// Action: 
//  Compiles a bit operation, forward only I think.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_compilebitoprofm
//
// C prototype:
//  void dg_compilebitoprofm (
//    Bufferhandle* pBHarrayhead,
//    struct dg_Sibformatter* psourceregtarget,
//    struct dg_Sibformatter* pdestregtarget,
//    UINT64 baseopcode)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
//  struct dg_Sibformatter* psourceregtarget
//                                pointer to SIB formatter structure for the
//                                 source target. The source target holds the
//                                 count for the bit operation.
//
//  struct dg_Sibformatter* pdestregtarget
//                                pointer to SIB formatter structure for the
//                                 destination target. The destination target
//                                 holds the index of the start of the bit array
//                                 for the bit operation.
//
//
//   UINT64 baseopcode            base opcode for the bit operation
//
// Action: 
//  Compiles a bit operation, forward only I think.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_compilebitopnofr
//
// C prototype:
//  void dg_compilebitopnofr (
//    Bufferhandle* pBHarrayhead,
//    struct dg_Sibformatter* pfirsttarget,  // top on stack
//    struct dg_Sibformatter* psecondtarget, // second on stack
//    UINT64 opcodeextension)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
//  struct dg_Sibformatter* psourceregtarget
//                                pointer to SIB formatter structure for the
//                                 source target. The source target holds the
//                                 count for the bit operation
//
//  struct dg_Sibformatter* pdestregtarget
//                                pointer to SIB formatter structure for the
//                                 destination target. The destination target
//                                 holds the index of the start of the bit array
//                                 for the bit operation
//
//  UINT64 opcodeextension    opcode extension
//
//
// Action: 
//  Compiles part of a bit operation I think. I forgot... wish I documented
//   this function when I wrote it :-)
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_compilebitopnofm
//
// C prototype:
//  void dg_compilebitopnofm (
//    Bufferhandle* pBHarrayhead,
//    struct dg_Sibformatter* pfirsttarget,  // top on stack
//    struct dg_Sibformatter* psecondtarget, // second on stack
//    UINT64 opcodeextension)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 the other used as the bufferhandle for the  
//                                 array where bufferhandles are stored.
//
//  struct dg_Sibformatter* psourceregtarget
//                                pointer to SIB formatter structure for the
//                                 source target. The source target holds the
//                                 count for the bit operation
//
//  struct dg_Sibformatter* pdestregtarget
//                                pointer to SIB formatter structure for the
//                                 destination target. The destination target
//                                 holds the index of the start of the bit array
//                                 for the bit operation
//
//  UINT64 opcodeextension    opcode extension
//
//
// Action: 
//  Compiles part of a bit operation I think. I forgot... wish I documented
//   this function when I wrote it :-)
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_compilebitop
//
// C prototype:
//  void dg_compilebitop (
//    Bufferhandle* pBHarrayhead,
//    struct dg_Sibformatter* pfirsttarget,  // top on stack
//    struct dg_Sibformatter* psecondtarget, // second on stack
//    UINT64 rofmbaseopcode,
//    UINT64 nofmopcodeextension)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
//  struct dg_Sibformatter* psourceregtarget
//                                pointer to SIB formatter structure for the
//                                 source target. The source target holds the
//                                 count for the bit operation
//
//  struct dg_Sibformatter* pdestregtarget
//                                pointer to SIB formatter structure for the
//                                 destination target. The destination target
//                                 holds the index of the start of the bit array
//                                 for the bit operation
//
//  UINT64 rofmbaseopcode         base opcode for the bit operation
//
//  UINT64 nofmopcodeextension    opcode extension for the nofm form of the
//                                 bit operation (presumably the value handed
//                                 to dg_compilebitopnofm; not confirmed)
//
//
// Action: 
//  Compiles a bit operation, handling forward, reverse and the different
//   addressing modes.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_compiledshiftoprtom
//
// C prototype:
//  void dg_compiledshiftoprtom (
//   Bufferhandle* pBHarrayhead,
//   struct dg_Sibformatter* pfirsttarget,  // mem operand
//   struct dg_Sibformatter* psecondtarget, // reg operand
//   UINT64 baseopcode)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
//  struct dg_Sibformatter* psourceregtarget
//                                pointer to SIB formatter structure for the
//                                 source target. The source target holds the
//                                 count for the bit operation.
//
//  struct dg_Sibformatter* pdestregtarget
//                                pointer to SIB formatter structure for the
//                                 destination target. The destination target
//                                 holds the index of the start of the bit array
//                                 for the bit operation.
//
//  UINT64 baseopcode             base opcode for the bit operation
//
// Action: 
//  Compiles a double shift instruction with a register source and a register
//   or memory destination.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_compiledshiftop
//
// C prototype:
//  void dg_compiledshiftop (
//   Bufferhandle* pBHarrayhead,
//   struct dg_Sibformatter* pfirsttarget,  // top on stack
//   struct dg_Sibformatter* psecondtarget, // second on stack
//   UINT64 baseopcode)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
//  struct dg_Sibformatter* psourceregtarget
//                                pointer to SIB formatter structure for the
//                                 source target. The source target holds the
//                                 count for the bit operation.
//
//  struct dg_Sibformatter* pdestregtarget
//                                pointer to SIB formatter structure for the
//                                 destination target. The destination target
//                                 holds the index of the start of the bit array
//                                 for the bit operation.
//
//  UINT64 baseopcode             base opcode for the bit operation
//
// Action: 
//  Compiles a double shift instruction. The source target must be a register.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthshldcomma ( SHLD, )
//
// C prototype:
//  void dg_forthshldcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( modeparameterlist srctargetparameterlist desttargetparameterlist -- )
//
//  The mode parameter list can contain these mode specifiers:
//   CL
//   immediatevalue N
//
//  The src target parameter list can contain this addressing mode specifier:
//
//   targetregister
//
//  The dest target parameter list for the target can contain these addressing
//   mode specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for the dest target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of mode parameters:
//   immediatevalue               this is the shift size in bits
//                                 if the data size is 64 bit, only the
//                                 lower 6 bits are used, otherwise only the
//                                 lower 5 bits are used
//   CL                           this is the shift size in bits
//                                 if the data size is 64 bit, only the
//                                 lower 6 bits are used, otherwise only the
//                                 lower 5 bits are used
//
//  Description of target parameters:
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A 
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//                                                              
// Execute state action:
//  The data size must be either 16 bits, 32 bits, or 64 bits.
//  Pulls three targets from the data stack and compiles the opcode sequence
//   for a double shift left.
//   This opcode sequence shifts the destination count bits to the left.
//   It also shifts the source count bits left into the destination.
//   After the instruction, the source is unchanged.
//  You can use the <- operator to put the desttargetparameterlist
//   before the srctargetparameterlist on the data stack.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... 
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthshrdcomma ( SHRD, )
//
// C prototype:
//  void dg_forthshrdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( modeparameterlist srctargetparameterlist desttargetparameterlist -- )
//
//  The mode parameter list can contain these mode specifiers:
//   CL
//   immediatevalue N
//
//  The src target parameter list can contain this addressing mode specifier:
//
//   targetregister
//
//  The dest target parameter list for the target can contain these addressing
//   mode specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for the dest target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of mode parameters:
//   immediatevalue               this is the shift size in bits
//                                 if the data size is 64 bit, only the
//                                 lower 6 bits are used, otherwise only the
//                                 lower 5 bits are used
//   CL                           this is the shift size in bits
//                                 if the data size is 64 bit, only the
//                                 lower 6 bits are used, otherwise only the
//                                 lower 5 bits are used
//
//  Description of target parameters:
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A 
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//                                                              
// Execute state action:
//  The data size must be either 16 bits, 32 bits, or 64 bits.
//  Pulls three targets from the data stack and compiles the opcode sequence
//   for a double shift right.
//   This opcode sequence shifts the destination count bits to the right.
//   It also shifts the source count bits right into the destination.
//   After the instruction, the source is unchanged.
//  You can use the <- operator to put the desttargetparameterlist
//   before the srctargetparameterlist on the data stack.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... 
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthimulacomma ( IMULA, )
//
// C prototype:
//  void dg_forthimulacomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   datavalue N
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   datavalue datasize IMMEDIATE
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   8BIT
//   16BIT
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               signed 32 bit value of an immediate target
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible.
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate 
//                                 value in bytes, can be either 0, 1, 2, or 4
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A 
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1, 2, or 4
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//                                                              
// Execute state action:
//  Target x is the destination and must be memory.
//  Pulls one target from the data stack and compiles the opcode sequence
//   for a signed multiply. In this opcode sequence, the AL, AX, EAX, or RAX
//   register is multiplied by the target depending on the size of the target.
//  The data size of the result is double that of the target after the
//   opcode sequence is executed, and the result is stored in the AX, DX:AX,
//   EDX:EAX, or RDX:RAX registers depending on the source target size.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... 
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthimulcomma ( IMUL, )
//
// C prototype:
//  void dg_forthimulcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   datavalue N
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   datavalue datasize IMMEDIATE
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   8BIT
//   16BIT
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               signed 32 bit value of an immediate target
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible.
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate 
//                                 value in bytes, can be either 0, 1, 2, or 4
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A 
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1, 2, or 4
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//                                                              
// Execute state action:
//  Target x is the first source.
//  Target y is the destination and must be a register.
//  Pulls two targets from the data stack and compiles the opcode sequence
//   for a signed multiply. In this opcode sequence, the destination
//   register is multiplied by the source target.
//  The data size of the result is the same as that of the sources.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... 
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthimulncomma ( IMULN, )
//
// C prototype:
//  void dg_forthimulncomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   datavalue N
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   datavalue datasize IMMEDIATE
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   8BIT
//   16BIT
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               signed 32 bit value of an immediate target
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible.
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate 
//                                 value in bytes, can be either 0, 1, 2, or 4
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A 
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1, 2, or 4
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//                                                              
// Execute state action:
//  Target x is the first source.
//  Target y is the destination and must be a register.
//  Target z is the second source and must be immediate.
//  Pulls three targets from the data stack and compiles the opcode sequence
//   for a signed multiply. In this opcode sequence, the two sources are
//   multiplied together and stored in the destination.
//  The data size of the result is the same as that of the sources.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... 
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthinn8toalcomma ( IN[N8]->AL, )
//
// C prototype:
//  void dg_forthinn8toalcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( n8 -- )
//
// Data stack in:
//
//  n8
//
//  Description of target parameters:
//
//   n8               a number from 0 to 255
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one number from the data stack and compiles the opcode sequence for
//   an x86 IN[N8]->AL, instruction. The sequence copies a byte from
//   i/o port n8 to the AL register.
//   The number n8 must be in the range of an 8 bit integer and gets 0 extended
//   to 16 bits to form the i/o port address.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  28 N  IN[N8]->AL,                   // does IN[28]->AL
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthinn8toeaxcomma ( IN[N8]->EAX, )
//
// C prototype:
//  void dg_forthinn8toeaxcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( n8 -- )
//
// Data stack in:
//
//  n8
//
//  Description of target parameters:
//
//   n8               a number from 0 to 255
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one number from the data stack and compiles the opcode sequence for
//   an x86 IN[N8]->EAX, instruction. The sequence copies a 32 bit value from
//   i/o port n8 to the EAX register.
//   The number n8 must be in the range of an 8 bit integer and gets 0 extended
//   to 16 bits to form the i/o port address.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  28 N  IN[N8]->EAX,                   // does IN[28]->EAX
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthinn8toaxcomma ( IN[N8]->AX, )
//
// C prototype:
//  void dg_forthinn8toaxcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( n8 -- )
//
// Data stack in:
//
//  n8
//
//  Description of target parameters:
//
//   n8               a number from 0 to 255
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one number from the data stack and compiles the opcode sequence for
//   an x86 IN[N8]->AX, instruction. The sequence copies a 16 bit value from
//   i/o port n8 to the AX register.
//   The number n8 must be in the range of an 8 bit integer and gets 0 extended
//   to 16 bits to form the i/o port address.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  28 N  IN[N8]->AX,                   // does IN[28]->AX
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthinccomma ( INC, )
//
// C prototype:
//  void dg_forthinccomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for the target can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you need to set the data size for modes requiring it, which are modes
//   that do not have at least one register target, you can use these:
//   8BIT
//   16BIT
//   32BIT
//   64BIT
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1, 2, 4, or 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after the addressing mode
//                                 parameters and can not come in the middle
//                                 of the addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//    an x86 INC instruction.
//    This opcode sequence does:
//      target <- target + 1
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  EAX  INC,                      // increments EAX
//  CX  INC,                       // increments CX
//  EDX [R]  32BIT INC,            // increments the 32 bit value at the
//                                 //  memory location specified by EDX,
//                                 //  size specifier required
//  R8 INC,                        // increments R8
//                                 //  64 bit address mode only
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthinsbcomma ( INSB, )
//
// C prototype:
//  void dg_forthinsbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode byte for the x86 INSB instruction.
//  In 32 bit addressing mode:
//   Copies an 8 bit value from the i/o port specified by the DX register
//   to [EDI]. ECX is decremented only when the REP, prefix is used.
//   EDI is adjusted according to the direction flag,
//    clear increments and set decrements.
//  In 64 bit addressing mode:
//   Copies an 8 bit value from the i/o port specified by the DX register
//   to [RDI]. RCX is decremented only when the REP, prefix is used.
//   RDI is adjusted according to the direction flag,
//    clear increments and set decrements.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Note:
//   Some operating systems require the direction bit to
//   be left to forward (clear) when passing control to operating system
//   routines.
//   I usually push the flag register when entering and pop it when exiting
//   subroutines to cover this requirement.
//   Docs say you can use the 0x67 prefix with this opcode to force this
//   instruction to use a smaller version of RDI/EDI. If you use the 0x67
//   prefix in 64 bit address mode, this opcode only uses EDI for the
//   address. If you use the 0x67 prefix in 32 bit address mode, this
//   opcode only uses DI for the address. Not sure how useful this is...
//
// 32 bit addressing mode example:
//  destaddress N  EDI  MOV,
//  count N  ECX  MOV,
//  REP,  INSB,  // fills count bytes at dest with data from i/o port
//
// 64 bit addressing mode example:
//  destaddress N  RDI  MOV,
//  count N  RCX  MOV,
//  REP,  INSB,  // fills count bytes at dest with data from i/o port
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthinsdcomma ( INSD, )
//
// C prototype:
//  void dg_forthinsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode byte for the x86 INSD instruction.
//  In 32 bit addressing mode:
//   Copies a 32 bit value from the i/o port specified by the DX register
//   to [EDI]. ECX is decremented only when the REP, prefix is used.
//   EDI is adjusted according to the direction flag,
//    clear increments and set decrements.
//  In 64 bit addressing mode:
//   Copies a 32 bit value from the i/o port specified by the DX register
//   to [RDI]. RCX is decremented only when the REP, prefix is used.
//   RDI is adjusted according to the direction flag,
//    clear increments and set decrements.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Note:
//   Some operating systems require the direction bit to
//   be left to forward (clear) when passing control to operating system
//   routines.
//   I usually push the flag register when entering and pop it when exiting
//   subroutines to cover this requirement.
//   Docs say you can use the 0x67 prefix with this opcode to force this
//   instruction to use a smaller version of RDI/EDI. If you use the 0x67
//   prefix in 64 bit address mode, this opcode only uses EDI for the
//   address. If you use the 0x67 prefix in 32 bit address mode, this
//   opcode only uses DI for the address. Not sure how useful this is...
//
// 32 bit addressing mode example:
//  destaddress N  EDI  MOV,
//  count N  ECX  MOV,
//  REP,  INSD,  // fills count 32 bit values at dest with data from
//               //  i/o port
//
// 64 bit addressing mode example:
//  destaddress N  RDI  MOV,
//  count N  RCX  MOV,
//  REP,  INSD,  // fills count 32 bit values at dest with data from
//               //  i/o port
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthinswcomma ( INSW, )
//
// C prototype:
//  void dg_forthinswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode byte for the x86 INSW instruction.
//  In 32 bit addressing mode:
//   Copies a 16 bit value from the i/o port specified by the DX register
//   to [EDI]. ECX is decremented only when the REP, prefix is used.
//   EDI is adjusted according to the direction flag,
//    clear increments and set decrements.
//  In 64 bit addressing mode:
//   Copies a 16 bit value from the i/o port specified by the DX register
//   to [RDI]. RCX is decremented only when the REP, prefix is used.
//   RDI is adjusted according to the direction flag,
//    clear increments and set decrements.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Note:
//   Some operating systems require the direction bit to
//   be left to forward (clear) when passing control to operating system
//   routines.
//   I usually push the flag register when entering and pop it when exiting
//   subroutines to cover this requirement.
//   Docs say you can use the 0x67 prefix with this opcode to force this
//   instruction to use a smaller version of RDI/EDI. If you use the 0x67
//   prefix in 64 bit address mode, this opcode only uses EDI for the
//   address. If you use the 0x67 prefix in 32 bit address mode, this
//   opcode only uses DI for the address. Not sure how useful this is...
//
// 32 bit addressing mode example:
//  destaddress N  EDI  MOV,
//  count N  ECX  MOV,
//  REP,  INSW,  // fills count 16 bit values at dest with data from
//               //  i/o port
//
// 64 bit addressing mode example:
//  destaddress N  RDI  MOV,
//  count N  RCX  MOV,
//  REP,  INSW,  // fills count 16 bit values at dest with data from
//               //  i/o port
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthinsertpscomma ( INSERTPS, )
//
// C prototype:
//  void dg_forthinsertpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for target x or y can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 INSERTPS instruction. This opcode sequence copies a 32 bit value
//   from the source to the destination. Bits 4 and 5 of the immediate target
//   select which of the four 32 bit regions of the destination gets the value.
//   If the source is an xmm register, bits 6 and 7 choose which of the four
//   32 bit regions are the source, otherwise, the first 32 bit value from the
//   memory target is used. Then, if any of bits 0 to 3 in the immediate target are
//   set, the corresponding 32 bit value in the destination is cleared.
//   If a bit is set to clear a destination value, and that same value is chosen
//   for the copy, the value is cleared.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM0  INSERTPS,   // [RBX][31:0] -> XMM0[31:0]
//
//  0F N  RBX [R]  XMM0  INSERTPS,   // 0 -> XMM0[31:0]
//                                   // 0 -> XMM0[63:32]
//                                   // 0 -> XMM0[95:64]
//                                   // 0 -> XMM0[127:96]
//
//  40 N  RBX [R]  XMM0  INSERTPS,   // [RBX][31:0] -> XMM0[31:0]
//                                   //  (bits 6 and 7 are ignored for a
//                                   //  memory source)
//
//  80 N  RBX [R]  XMM0  INSERTPS,   // [RBX][31:0] -> XMM0[31:0]
//
//  C0 N  RBX [R]  XMM0  INSERTPS,   // [RBX][31:0] -> XMM0[31:0]
//
//  00 N  XMM2  XMM0  INSERTPS,      // XMM2[31:0] -> XMM0[31:0]
//
//  40 N  XMM2  XMM0  INSERTPS,      // XMM2[63:32] -> XMM0[31:0]
//
//  90 N  XMM0 <- XMM2  INSERTPS, // XMM2[95:64] -> XMM0[63:32]
//
//  C0 N  XMM8  XMM0 INSERTPS,       // XMM8[127:96] -> XMM0[31:0]
//
//  C3 N  XMM8  XMM0 INSERTPS,       // 0 -> XMM0[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvinsertpscomma ( VINSERTPS, )
//
// C prototype:
//  void dg_forthvinsertpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist 
//     targetzparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for target y can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targets x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for target x or z can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls four targets from the data stack and compiles the opcode sequence for
//   an x86 VINSERTPS instruction. This opcode sequence copies a 32 bit value
//   from the source to the destination. Bits 4 and 5 of the immediate target
//   select which of the four 32 bit regions of the destination gets the value.
//   If the source is an xmm register, bits 6 and 7 choose which of the four
//   32 bit regions are the source, otherwise, the first 32 bit value from the
//   memory target is used. Then, if any of bits 0 to 3 in the immediate target are
//   set, the corresponding 32 bit value in the destination is cleared.
//   If a bit is set to clear a destination value, and that same value is chosen
//   for the copy, the value is cleared. The remaining values in the destination
//   that were not cleared or copied from the source are copied from target y.
//   The upper 128 bits of the ymm register associated with the destination are
//   cleared.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM1  XMM0  VINSERTPS,   // [RBX][31:0]  -> XMM0[31:0]
//                                          // XMM1[127:32] -> XMM0[127:32]
//
//  0F N  RBX [R]  XMM1  XMM0  VINSERTPS,   // 0 -> XMM0[31:0]
//                                          // 0 -> XMM0[63:32]
//                                          // 0 -> XMM0[95:64]
//                                          // 0 -> XMM0[127:96]
//
//  40 N  RBX [R]  XMM1  XMM0  VINSERTPS,   // [RBX][31:0]  -> XMM0[31:0]
//                                          // XMM1[127:32] -> XMM0[127:32]
//                                          //  (bits 6 and 7 are ignored for
//                                          //  a memory source)
//
//  80 N  RBX [R]  XMM1  XMM0  VINSERTPS,   // [RBX][31:0]  -> XMM0[31:0]
//                                          // XMM1[127:32] -> XMM0[127:32]
//
//  C0 N  RBX [R]  XMM1  XMM0  VINSERTPS,   // [RBX][31:0]  -> XMM0[31:0]
//                                          // XMM1[127:32] -> XMM0[127:32]
//
//  00 N  XMM2  XMM1  XMM0  VINSERTPS,      // XMM2[31:0]   -> XMM0[31:0]
//                                          // XMM1[127:32] -> XMM0[127:32]
//
//  40 N  XMM2  XMM1  XMM0  VINSERTPS,      // XMM2[63:32]  -> XMM0[31:0]
//                                          // XMM1[127:32] -> XMM0[127:32]
//
//  90 N  XMM0 <- XMM1  XMM2  VINSERTPS, // XMM2[95:64]  -> XMM0[63:32]
//                                          // XMM1[31:0]   -> XMM0[31:0]
//                                          // XMM1[127:64] -> XMM0[127:64]
//
//  C0 N  XMM8  XMM1  XMM0 VINSERTPS,       // XMM8[127:96] -> XMM0[31:0]
//                                          // XMM1[127:32] -> XMM0[127:32]
//
//  C3 N  XMM8  XMM1  XMM0 VINSERTPS,       // 0 -> XMM0[63:0]
//                                          // XMM1[127:64] -> XMM0[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthintcomma ( INT, )
//
// C prototype:
//  void dg_forthintcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( immediatetargetparameterlist -- )
//
//  immediatetargetparameterlist can be one of:
//   ( immediatevalue N -- )
//   ( immediatevalue  minimumimmediatesize  IMMEDIATE -- )
//
//   immediatevalue               Value of an immediate target. Only the lower
//                                 8 bits are used.
//   minimumimmediatesize         minimum size of immediate target and is ignored
//                                 for INT,
//   IMMEDIATE                    specifies an immediate target. The value for
//                                 this target is encoded into the opcode
//                                 sequence.
//   N                            specifies an immediate target with a minimum
//                                 size of 0
//
// Execute state action:
//  Pulls immediate target from data stack then compiles the opcode string
//   for the x86 INT nn instruction where nn is the lower 8 bits of the
//   immediate target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  3 N  INT,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthint3comma ( INT3, )
//
// C prototype:
//  void dg_forthint3comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode byte for the x86 INT3 instruction.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  INT3,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthintocomma ( INTO, )
//
// C prototype:
//  void dg_forthintocomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode byte for the x86 INTO instruction.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  INTO,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthinvdcomma ( INVD, )
//
// C prototype:
//  void dg_forthinvdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode string for the x86 INVD instruction.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  INVD,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthinvlpgcomma ( INVLPG, )
//
// C prototype:
//  void dg_forthinvlpgcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 INVLPG instruction. This opcode sequence invalidates any
//   translation lookaside buffer entries for the page that holds the memory
//   at the target.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  INVLPG,             // invalidate page that contains
//                               //  memory at address = RAX
//
// Note:
//  Data size is not required for this instruction and is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
//
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthinvpcidcomma ( INVPCID, )
//
// C prototype:
//  void dg_forthinvpcidcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0, 1, 2, or 4
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 INVPCID instruction. This opcode sequence invalidates entries in
//   translation lookaside buffers for the descriptor in the memory target
//   using invalidation type in the register target.
//   Data size is ignored for this instruction for memory targets.
//   Register size is ignored for this instruction. No matter what you specify,
//    the register size used will be the same as the address mode size.
//   Direction is ignored for this instruction.
//   One target must be memory, and the other a register. You can specify
//    the targets in either order and this compiling word will figure it out.
//   Intel docs say the invalidation type must be in the range of 0 to 3...
//    if you use a number bigger than that there will be an error. I'm guessing
//    the whole register value is the invalidation type...
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//   RAX [R]  ECX  INVPCID,  // invalidate [RAX] using RCX invalidation type
//
//   RAX  RCX [R]  INVPCID,  // invalidate [RCX] using RAX invalidation type
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovzxcomma ( MOVZX, )
//
// C prototype:
//  void dg_forthmovzxcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  The destination target must be a register and must be 16 bits, 32 bits,
//   or 64 bits.
//
//  The source target must be 8 bits or 16 bits.
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you need to set the data size for the source target, which you will
//   probably need to do if it is a memory target, you can use one of these:
//   8BIT
//   16BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   8BIT                         sets the data size of the source target to
//                                 1 byte
//                                 This is pushed after the source target
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   16BIT                        sets the data size of the source target to
//                                 2 bytes
//                                 This is pushed after the source target
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate 
//                                 value in bytes, can be either 0, 1, 2, or 4
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A 
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the source target in bytes,
//                                 can be 1 or 2
//   DATASIZE                     sets the data size of the source target
//                                 This is pushed after the source target
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//                                                              
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence
//   for an x86 MOVZX instruction (a move with zero extend).
//  Source target must be 8 or 16 bit memory or a register.
//  Destination target must be a 16, 32, or 64 bit register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... 
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovsxcomma ( MOVSX, )
//
// C prototype:
//  void dg_forthmovsxcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  The destination target must be a register and must be 16 bits, 32 bits,
//   or 64 bits.
//
//  The source target must be 8 bits or 16 bits.
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  If you need to set the data size for the source target, which you will
//   probably need to do if it is a memory target, you can use one of these:
//   8BIT
//   16BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   8BIT                         sets the data size of the source target to
//                                 1 byte
//                                 This is pushed after the source target
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   16BIT                        sets the data size of the source target to
//                                 2 bytes
//                                 This is pushed after the source target
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A 
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the source in bytes,
//                                 can be 1 or 2
//   DATASIZE                     sets the data size of the source target
//                                 This is pushed after the source target
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//                                                              
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence
//   for a MOVSX instruction. This opcode sequence copies a signed integer
//   value from the source, sign extends it to the length of the destination
//   and puts the result into the destination.
//
//  Source target must be 8 or 16 bit memory or a register.
//  Destination target must be a 16, 32, or 64 bit register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... 
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmovsxdcomma ( MOVSXD, )
//
// C prototype:
//  void dg_forthmovsxdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for a target can contain these addressing mode specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  The destination target must be a register and must be 64 bits.
//
//  The source target must be 32 bits.
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  If you need to set the data size for the source target (for example when
//   it is a memory target), you can use this:
//   32BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   32BIT                        sets the data size of the source target to
//                                 4 bytes
//                                 This is pushed after the source target
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the source in bytes,
//                                 can be 4
//   DATASIZE                     sets the data size of the source target
//                                 This is pushed after the source target
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   a MOVSXD instruction. This opcode sequence takes a value from a 32 bit
//   signed integer source and sign extends it to 64 bits and puts the result
//   into a 64 bit destination register.
//
//  Source target must be 32 bit memory or a register.
//  Destination target must be a 64 bit register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_ntoeaxcomma ( N>EAX, )
//
// C prototype:
//  void dg_ntoeaxcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( N -- )
//
// Data stack in:
//
//  N
//
// Data stack out:
//  none
//                                                              
// Execute state action:
//  Pulls one 32 bit value from the data stack and compiles the opcode sequence to
//   move that 32 bit value into the EAX register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... 
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthotormovcomma ( O->RMOV, )
//
// C prototype:
//  void dg_forthotormovcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( currentcompilebufferoffset 32bitregister -- )
//
// Data stack in:
//   currentcompilebufferoffset   32 bit value of an offset in the current
//                                 compile buffer
//
//   32bitregister                one of: EAX EBX ECX EDX EBP ESI EDI ESP
//
// Data stack out:
//  none
//                                                              
// Execute state action:
//  Pops two values from the data stack, and compiles the opcode sequence to
//   move the address of an offset in the current compile buffer to the
//   specified register. This opcode sequence uses several instructions to do
//   this operation.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... 
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthbracketotormovcomma ( [O]->RMOV, )
//
// C prototype:
//  void dg_forthbracketotormovcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( currentcompilebufferoffset 32bitregister -- )
//
// Data stack in:
//   currentcompilebufferoffset   32 bit value of an offset in the current
//                                 compile buffer
//
//   32bitregister                one of: EAX EBX ECX EDX EBP ESI EDI ESP
//
// Data stack out:
//  none
//                                                              
// Execute state action:
//  Pops two values from the data stack, and compiles the opcode sequence to
//   move the 32bit contents of memory at an offset in the current compile
//   buffer to the specified register. This opcode sequence uses several
//   instructions to do this operation.
//  This instruction literally does:
//   0 EIP+N CALL, OHERE SWAP - reg POP, EAX SWAP [R+N] MOV,
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... 
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthopushcomma ( OPUSH, )
//
// C prototype:
//  void dg_forthopushcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( currentcompilebufferoffset -- )
//
// Data stack in:
//   currentcompilebufferoffset   32 bit value of an offset in the current
//                                 compile buffer
//
// Data stack out:
//  none
//                                                              
// Execute state action:
//  Pops an offset in the current compile buffer from the data stack,
//   and compiles the opcode sequence to calculate and push the address at that
//   offset to the return stack.
//  This instruction literally does:
//   0 EIP+N CALL, OHERE SWAP - ESP [R] 32BIT ADD,
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... 
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthbracketopushcomma ( [O]PUSH, )
//
// C prototype:
//  void dg_forthbracketopushcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( currentcompilebufferoffset -- )
//
// Data stack in:
//   currentcompilebufferoffset   32 bit value of an offset in the current
//                                 compile buffer
//
// Data stack out:
//  none
//                                                              
// Execute state action:
//  Pops an offset in the current compile buffer from the data stack,
//   and compiles the opcode sequence to push the 32 bit value
//   in the memory at the calculated address to the return stack
//  This instruction literally does:
//   0 EIP+N CALL, EAX POP, OHERE SWAP - EAX SWAP, [R+N] 32BIT PUSH,
//  This instruction uses the EAX register.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... 
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthbracketopopcomma ( [O]POP, )
//
// C prototype:
//  void dg_forthbracketopopcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( currentcompilebufferoffset -- )
//
// Data stack in:
//   currentcompilebufferoffset   32 bit value of an offset in the current
//                                 compile buffer
//
// Data stack out:
//  none
//                                                              
// Execute state action:
//  Pops an offset in the current compile buffer from the data stack,
//   and compiles the opcode sequence to pop the top value on the return
//   stack to the memory at the calculated address
//  This instruction literally does:
//   0 EIP+N CALL, EAX POP, OHERE SWAP - EAX SWAP, [R+N] 32BIT POP,
//  This instruction uses the EAX register.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... 
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthstrtopstrpushcomma ( $>P$PUSH, )
//
// C prototype:
//  void dg_forthstrtopstrpushcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( $ -$- )
//
// String stack in:
//   $                            string to compile into memory
//
// String stack out:
//  none
//                                                              
// Execute state action:
//  Compiles a call to the address after the compiled call plus the length of
//   the top string on the string stack. Then copies the string to the memory
//   after the call. Then drops the string off the string stack.
//  You can use this instruction to push the address of a string constant
//   to the return stack. The address is calculated at run time making it
//   useful in position independent code.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... 
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrdmsrcomma ( RDMSR, )
//
// C prototype:
//  void dg_forthrdmsrcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that does:
//   modelspecificreg[ECX] -> EDX:EAX
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrdpmccomma ( RDPMC, )
//
// C prototype:
//  void dg_forthrdpmccomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that does:
//   performancemonitoringcounter[ECX] -> EDX:EAX
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrdtsccomma ( RDTSC, )
//
// C prototype:
//  void dg_forthrdtsccomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that does:
//   timestampcounter -> EDX:EAX
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrdtscpcomma ( RDTSCP, )
//
// C prototype:
//  void dg_forthrdtscpcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that does:
//   timestampcounter -> EDX:EAX
//   timestampcounteraux -> ECX
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthud2comma ( UD2, )
//
// C prototype:
//  void dg_forthud2comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that raises an invalid opcode exception.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthverrcomma ( VERR, )
//
// C prototype:
//  void dg_forthverrcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 VERR instruction. This opcode sequence verifies if the data segment
//   specified by the source is readable. If it is, the zero flag is set.
//   Otherwise the zero flag is cleared.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  VERR,             // if segment[[RAX]] is readable then 1 -> ZF
//                             // else 0 -> ZF
//
//  AX VERR,                   // if segment[AX] is readable then 1 -> ZF
//                             // else 0 -> ZF
// Note:
//  Data size is not required for this instruction and is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthverwcomma ( VERW, )
//
// C prototype:
//  void dg_forthverwcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 VERW instruction. This opcode sequence verifies if the data segment
//   specified by the source is writable. If it is, the zero flag is set.
//   Otherwise the zero flag is cleared.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  VERW,             // if segment[[RAX]] is writable then 1 -> ZF
//                             // else 0 -> ZF
//
//  AX VERW,                   // if segment[AX] is writable then 1 -> ZF
//                             // else 0 -> ZF
// Note:
//  Data size is not required for this instruction and is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthwbinvdcomma ( WBINVD, )
//
// C prototype:
//  void dg_forthwbinvdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that flushes the processor's internal caches and sends a signal
//   to tell the external caches to flush.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthwrmsrcomma ( WRMSR, )
//
// C prototype:
//  void dg_forthwrmsrcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that does:
//   EDX:EAX -> modelspecificreg[ECX]
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthxacquirecomma ( XACQUIRE, )
//
// C prototype:
//  void dg_forthxacquirecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that starts lock elision for following instructions that
//   support XACQUIRE lock elision.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthxendcomma ( XEND, )
//
// C prototype:
//  void dg_forthxendcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that marks the end of an RTM code region.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthxgetbvcomma ( XGETBV, )
//
// C prototype:
//  void dg_forthxgetbvcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that does extendedcontrolregister[ECX] -> EDX:EAX.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthxreleasecomma ( XRELEASE, )
//
// C prototype:
//  void dg_forthxreleasecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that ends lock elision started with XACQUIRE.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthxsetbvcomma ( XSETBV, )
//
// C prototype:
//  void dg_forthxsetbvcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that does EDX:EAX -> extendedcontrolregister[ECX]
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthxtestcomma ( XTEST, )
//
// C prototype:
//  void dg_forthxtestcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Execute state action:
//  Compiles code that clears the zero flag if this instruction executes inside
//   an RTM region or HLE region, otherwise it is set.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmpsadbwcomma ( MPSADBW, )
//
// C prototype:
//  void dg_forthmpsadbwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for the target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 MPSADBW instruction. This opcode sequence calculates the difference
//   between unsigned bytes from the source and bytes from the destination. Then
//   takes the absolute value of four consecutive differences and adds them up
//   to generate eight 16 bit results. The source start position for each of
//   the results is the same for each result and is determined by bits 0 and 1. Bits 0
//   and 1 determine which of the four 32 bit values in the source to use.
//   Bit 2 determines which 32 bit value in the destination to start with, but
//   for each result, the start offset is incremented one byte.
//
//   immediatevalue[1:0]     source value to use
//    0                       source[31:0]
//    1                       source[63:32]
//    2                       source[95:64]
//    3                       source[127:96]
//
//   immediatevalue[2]       destination value to start with
//    0                       destination[31:0]
//    1                       destination[63:32]
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM0  MPSADBW,   // ABS([RBX][7:0]   - XMM0[7:0]) +
//                                  // ABS([RBX][15:8]  - XMM0[15:8]) +
//                                  // ABS([RBX][23:16] - XMM0[23:16]) +
//                                  // ABS([RBX][31:24] - XMM0[31:24])
//                                  //  -> XMM0[15:0]
//
//                                  // ABS([RBX][7:0]   - XMM0[15:8]) +
//                                  // ABS([RBX][15:8]  - XMM0[23:16]) +
//                                  // ABS([RBX][23:16] - XMM0[31:24]) +
//                                  // ABS([RBX][31:24] - XMM0[39:32])
//                                  //  -> XMM0[31:16]
//
//                                  ...
//
//                                  // ABS([RBX][7:0]   - XMM0[63:56]) +
//                                  // ABS([RBX][15:8]  - XMM0[71:64]) +
//                                  // ABS([RBX][23:16] - XMM0[79:72]) +
//                                  // ABS([RBX][31:24] - XMM0[87:80])
//                                  //  -> XMM0[127:112]
//
//  01 N  RBX [R]  XMM0  MPSADBW,   // ABS([RBX][39:32] - XMM0[7:0]) +
//                                  // ABS([RBX][47:40] - XMM0[15:8]) +
//                                  // ABS([RBX][55:48] - XMM0[23:16]) +
//                                  // ABS([RBX][63:56] - XMM0[31:24])
//                                  //  -> XMM0[15:0]
//
//                                  // ABS([RBX][39:32] - XMM0[15:8]) +
//                                  // ABS([RBX][47:40] - XMM0[23:16]) +
//                                  // ABS([RBX][55:48] - XMM0[31:24]) +
//                                  // ABS([RBX][63:56] - XMM0[39:32])
//                                  //  -> XMM0[31:16]
//
//                                  ...
//
//                                  // ABS([RBX][39:32] - XMM0[63:56]) +
//                                  // ABS([RBX][47:40] - XMM0[71:64]) +
//                                  // ABS([RBX][55:48] - XMM0[79:72]) +
//                                  // ABS([RBX][63:56] - XMM0[87:80])
//                                  //  -> XMM0[127:112]
//
//  04 N  RBX [R]  XMM0  MPSADBW,   // ABS([RBX][7:0]   - XMM0[39:32]) +
//                                  // ABS([RBX][15:8]  - XMM0[47:40]) +
//                                  // ABS([RBX][23:16] - XMM0[55:48]) +
//                                  // ABS([RBX][31:24] - XMM0[63:56])
//                                  //  -> XMM0[15:0]
//
//                                  // ABS([RBX][7:0]   - XMM0[47:40]) +
//                                  // ABS([RBX][15:8]  - XMM0[55:48]) +
//                                  // ABS([RBX][23:16] - XMM0[63:56]) +
//                                  // ABS([RBX][31:24] - XMM0[71:64])
//                                  //  -> XMM0[31:16]
//
//                                  ...
//
//                                  // ABS([RBX][7:0]   - XMM0[95:88]) +
//                                  // ABS([RBX][15:8]  - XMM0[103:96]) +
//                                  // ABS([RBX][23:16] - XMM0[111:104]) +
//                                  // ABS([RBX][31:24] - XMM0[119:112])
//                                  //  -> XMM0[127:112]
//
//  01 N  XMM2  XMM0  MPSADBW,      // ABS(XMM2[39:32] - XMM0[7:0]) +
//                                  // ABS(XMM2[47:40] - XMM0[15:8]) +
//                                  // ABS(XMM2[55:48] - XMM0[23:16]) +
//                                  // ABS(XMM2[63:56] - XMM0[31:24])
//                                  //  -> XMM0[15:0]
//
//                                  // ABS(XMM2[39:32] - XMM0[15:8]) +
//                                  // ABS(XMM2[47:40] - XMM0[23:16]) +
//                                  // ABS(XMM2[55:48] - XMM0[31:24]) +
//                                  // ABS(XMM2[63:56] - XMM0[39:32])
//                                  //  -> XMM0[31:16]
//
//                                  ...
//
//                                  // ABS(XMM2[39:32] - XMM0[63:56]) +
//                                  // ABS(XMM2[47:40] - XMM0[71:64]) +
//                                  // ABS(XMM2[55:48] - XMM0[79:72]) +
//                                  // ABS(XMM2[63:56] - XMM0[87:80])
//                                  //  -> XMM0[127:112]
//
//  01 N  XMM0 <- XMM2  MPSADBW, // ABS(XMM2[39:32] - XMM0[7:0]) +
//                                  // ABS(XMM2[47:40] - XMM0[15:8]) +
//                                  // ABS(XMM2[55:48] - XMM0[23:16]) +
//                                  // ABS(XMM2[63:56] - XMM0[31:24])
//                                  //  -> XMM0[15:0]
//
//                                  // ABS(XMM2[39:32] - XMM0[15:8]) +
//                                  // ABS(XMM2[47:40] - XMM0[23:16]) +
//                                  // ABS(XMM2[55:48] - XMM0[31:24]) +
//                                  // ABS(XMM2[63:56] - XMM0[39:32])
//                                  //  -> XMM0[31:16]
//
//                                  ...
//
//                                  // ABS(XMM2[39:32] - XMM0[63:56]) +
//                                  // ABS(XMM2[47:40] - XMM0[71:64]) +
//                                  // ABS(XMM2[55:48] - XMM0[79:72]) +
//                                  // ABS(XMM2[63:56] - XMM0[87:80])
//                                  //  -> XMM0[127:112]
//
//  01 N  XMM8  XMM0 MPSADBW,       // ABS(XMM8[39:32] - XMM0[7:0]) +
//                                  // ABS(XMM8[47:40] - XMM0[15:8]) +
//                                  // ABS(XMM8[55:48] - XMM0[23:16]) +
//                                  // ABS(XMM8[63:56] - XMM0[31:24])
//                                  //  -> XMM0[15:0]
//
//                                  // ABS(XMM8[39:32] - XMM0[15:8]) +
//                                  // ABS(XMM8[47:40] - XMM0[23:16]) +
//                                  // ABS(XMM8[55:48] - XMM0[31:24]) +
//                                  // ABS(XMM8[63:56] - XMM0[39:32])
//                                  //  -> XMM0[31:16]
//
//                                  ...
//
//                                  // ABS(XMM8[39:32] - XMM0[63:56]) +
//                                  // ABS(XMM8[47:40] - XMM0[71:64]) +
//                                  // ABS(XMM8[55:48] - XMM0[79:72]) +
//                                  // ABS(XMM8[63:56] - XMM0[87:80])
//                                  //  -> XMM0[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvmpsadbwcomma ( VMPSADBW, )
//
// C prototype:
//  void dg_forthvmpsadbwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist 
//     targetzparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for the target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for target y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targets x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls four targets from the data stack and compiles the opcode sequence for
//   an x86 VMPSADBW instruction. If the destination target is an xmm register,
//   then this opcode sequence calculates the difference
//   between unsigned bytes from the source and bytes from the target y. Then
//   takes the absolute value of four consecutive differences and adds them up
//   to generate eight 16 bit results. The source start position for each of
//   the results is the same for each result and is determined by bits 0 and 1. Bits 0
//   and 1 determine which of the four 32 bit values in the source to use.
//   Bit 2 determines which 32 bit value in target y to start with, but
//   for each result, the start offset is incremented one byte.
//   If the destination is a ymm register, then this opcode sequence does the
//   same thing for the lower 128 bits of the ymm register as if it were an
//   xmm register. Then for the upper 128 bits it does not keep sliding target
//   y from the position it ended with in the lower 128 bits. Instead it uses 
//   bits 3 and 4 to determine which of the four 32 bit values in the upper 128 bits
//   of the source to use for the source, and bit 5 of the
//   immediate value to determine which 32 bit section of the upper 128 bits of
//   target y to start with for sliding target y.
//
//   immediatevalue[1:0]     source value to use for lower 128 bits
//    0                       source[31:0]
//    1                       source[63:32]
//    2                       source[95:64]
//    3                       source[127:96]
//
//   immediatevalue[2]       target y value to start with for lower 128 bits
//    0                       targety[31:0]
//    1                       targety[63:32]
//
//   immediatevalue[4:3]     source value to use for upper 128 bits
//    0                       source[159:128]
//    1                       source[191:160]
//    2                       source[223:192]
//    3                       source[255:224]
//
//   immediatevalue[5]       target y value to start with for upper 128 bits
//    0                       targety[159:128]
//    1                       targety[191:160]
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM1  XMM0  VMPSADBW,
//                                  // ABS([RBX][7:0]   - XMM1[7:0]) +
//                                  // ABS([RBX][15:8]  - XMM1[15:8]) +
//                                  // ABS([RBX][23:16] - XMM1[23:16]) +
//                                  // ABS([RBX][31:24] - XMM1[31:24])
//                                  //  -> XMM0[15:0]
//
//                                  // ABS([RBX][7:0]   - XMM1[15:8]) +
//                                  // ABS([RBX][15:8]  - XMM1[23:16]) +
//                                  // ABS([RBX][23:16] - XMM1[31:24]) +
//                                  // ABS([RBX][31:24] - XMM1[39:32])
//                                  //  -> XMM0[31:16]
//
//                                  ...
//
//                                  // ABS([RBX][7:0]   - XMM1[63:56]) +
//                                  // ABS([RBX][15:8]  - XMM1[71:64]) +
//                                  // ABS([RBX][23:16] - XMM1[79:72]) +
//                                  // ABS([RBX][31:24] - XMM1[87:80])
//                                  //  -> XMM0[127:112]
//
//  01 N  RBX [R]  XMM1  XMM0  VMPSADBW,
//                                  // ABS([RBX][39:32] - XMM1[7:0]) +
//                                  // ABS([RBX][47:40] - XMM1[15:8]) +
//                                  // ABS([RBX][55:48] - XMM1[23:16]) +
//                                  // ABS([RBX][63:56] - XMM1[31:24])
//                                  //  -> XMM0[15:0]
//
//                                  // ABS([RBX][39:32] - XMM1[15:8]) +
//                                  // ABS([RBX][47:40] - XMM1[23:16]) +
//                                  // ABS([RBX][55:48] - XMM1[31:24]) +
//                                  // ABS([RBX][63:56] - XMM1[39:32])
//                                  //  -> XMM0[31:16]
//
//                                  ...
//
//                                  // ABS([RBX][39:32] - XMM1[63:56]) +
//                                  // ABS([RBX][47:40] - XMM1[71:64]) +
//                                  // ABS([RBX][55:48] - XMM1[79:72]) +
//                                  // ABS([RBX][63:56] - XMM1[87:80])
//                                  //  -> XMM0[127:112]
//
//  04 N  RBX [R]  XMM1  XMM0  VMPSADBW,
//                                  // ABS([RBX][7:0]   - XMM1[39:32]) +
//                                  // ABS([RBX][15:8]  - XMM1[47:40]) +
//                                  // ABS([RBX][23:16] - XMM1[55:48]) +
//                                  // ABS([RBX][31:24] - XMM1[63:56])
//                                  //  -> XMM0[15:0]
//
//                                  // ABS([RBX][7:0]   - XMM1[47:40]) +
//                                  // ABS([RBX][15:8]  - XMM1[55:48]) +
//                                  // ABS([RBX][23:16] - XMM1[63:56]) +
//                                  // ABS([RBX][31:24] - XMM1[71:64])
//                                  //  -> XMM0[31:16]
//
//                                  ...
//
//                                  // ABS([RBX][7:0]   - XMM1[95:88]) +
//                                  // ABS([RBX][15:8]  - XMM1[103:96]) +
//                                  // ABS([RBX][23:16] - XMM1[111:104]) +
//                                  // ABS([RBX][31:24] - XMM1[119:112])
//                                  //  -> XMM0[127:112]
//
//  01 N  XMM2  XMM1  XMM0  VMPSADBW,
//                                  // ABS(XMM2[39:32] - XMM1[7:0]) +
//                                  // ABS(XMM2[47:40] - XMM1[15:8]) +
//                                  // ABS(XMM2[55:48] - XMM1[23:16]) +
//                                  // ABS(XMM2[63:56] - XMM1[31:24])
//                                  //  -> XMM0[15:0]
//
//                                  // ABS(XMM2[39:32] - XMM1[15:8]) +
//                                  // ABS(XMM2[47:40] - XMM1[23:16]) +
//                                  // ABS(XMM2[55:48] - XMM1[31:24]) +
//                                  // ABS(XMM2[63:56] - XMM1[39:32])
//                                  //  -> XMM0[31:16]
//
//                                  ...
//
//                                  // ABS(XMM2[39:32] - XMM1[63:56]) +
//                                  // ABS(XMM2[47:40] - XMM1[71:64]) +
//                                  // ABS(XMM2[55:48] - XMM1[79:72]) +
//                                  // ABS(XMM2[63:56] - XMM1[87:80])
//                                  //  -> XMM0[127:112]
//
//  01 N  XMM0 <- XMM1  XMM2  VPSADBW, 
//                                  // ABS(XMM2[39:32] - XMM1[7:0]) +
//                                  // ABS(XMM2[47:40] - XMM1[15:8]) +
//                                  // ABS(XMM2[55:48] - XMM1[23:16]) +
//                                  // ABS(XMM2[63:56] - XMM1[31:24])
//                                  //  -> XMM0[15:0]
//
//                                  // ABS(XMM2[39:32] - XMM1[15:8]) +
//                                  // ABS(XMM2[47:40] - XMM1[23:16]) +
//                                  // ABS(XMM2[55:48] - XMM1[31:24]) +
//                                  // ABS(XMM2[63:56] - XMM1[39:32])
//                                  //  -> XMM0[31:16]
//
//                                  ...
//
//                                  // ABS(XMM2[39:32] - XMM1[63:56]) +
//                                  // ABS(XMM2[47:40] - XMM1[71:64]) +
//                                  // ABS(XMM2[55:48] - XMM1[79:72]) +
//                                  // ABS(XMM2[63:56] - XMM1[87:80])
//                                  //  -> XMM0[127:112]
//
//  11 N  YMM8  YMM1  YMM0 VPSADBW, //  lower 128 bit section     
//                                  // ABS(YMM8[39:32] - YMM1[7:0]) +
//                                  // ABS(YMM8[47:40] - YMM1[15:8]) +
//                                  // ABS(YMM8[55:48] - YMM1[23:16]) +
//                                  // ABS(YMM8[63:56] - YMM1[31:24])
//                                  //  -> YMM0[15:0]
//
//                                  // ABS(YMM8[39:32] - YMM1[15:8]) +
//                                  // ABS(YMM8[47:40] - YMM1[23:16]) +
//                                  // ABS(YMM8[55:48] - YMM1[31:24]) +
//                                  // ABS(YMM8[63:56] - YMM1[39:32])
//                                  //  -> YMM0[31:16]
//
//                                  ...
//
//                                  // ABS(YMM8[39:32] - YMM1[63:56]) +
//                                  // ABS(YMM8[47:40] - YMM1[71:64]) +
//                                  // ABS(YMM8[55:48] - YMM1[79:72]) +
//                                  // ABS(YMM8[63:56] - YMM1[87:80])
//                                  //  -> YMM0[127:112]
//
//                                  //  upper 128 bit section
//                                  // ABS(YMM8[167:160] - YMM1[135:128]) +
//                                  // ABS(YMM8[175:168] - YMM1[143:136]) +
//                                  // ABS(YMM8[183:176] - YMM1[151:144]) +
//                                  // ABS(YMM8[191:184] - YMM1[159:152])
//                                  //  -> YMM0[143:128]
//
//                                  // ABS(YMM8[167:160] - YMM1[143:136]) +
//                                  // ABS(YMM8[175:168] - YMM1[151:144]) +
//                                  // ABS(YMM8[183:176] - YMM1[159:152]) +
//                                  // ABS(YMM8[191:184] - YMM1[167:160])
//                                  //  -> YMM0[159:144]
//
//                                  ...
//
//                                  // ABS(YMM8[167:160] - YMM1[191:184]) +
//                                  // ABS(YMM8[175:168] - YMM1[199:192]) +
//                                  // ABS(YMM8[183:176] - YMM1[207:200]) +
//                                  // ABS(YMM8[191:184] - YMM1[215:208])
//                                  //  -> YMM0[255:240]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register. The first target must be the immediate target. If you use
//   -> it must come after a memory, xmm, or ymm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpblendwcomma ( PBLENDW, )
//
// C prototype:
//  void dg_forthpblendwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( immediatetargetparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  immediatetargetparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for the immediate target can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 PBLENDW instruction. This sequence copies 16 bit values from the source
//   to the destination if the bit for the value in the immediate target is set.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PBLENDW,     // if IMM[0]  is set then [RBX][15:0]    -> XMM1[15:0]
//                              // if IMM[1]  is set then [RBX][31:16]   -> XMM1[31:16]
//                              // if IMM[2]  is set then [RBX][47:32]   -> XMM1[47:32]
//                              // if IMM[3]  is set then [RBX][63:48]   -> XMM1[63:48]
//                              // if IMM[4]  is set then [RBX][79:64]   -> XMM1[79:64]
//                              // if IMM[5]  is set then [RBX][95:80]   -> XMM1[95:80]
//                              // if IMM[6]  is set then [RBX][111:96]  -> XMM1[111:96]
//                              // if IMM[7]  is set then [RBX][127:112] -> XMM1[127:112]
//
//  XMM2  XMM1  PBLENDW,        // if IMM[0]  is set then XMM2[15:0]    -> XMM1[15:0]
//                              // if IMM[1]  is set then XMM2[31:16]   -> XMM1[31:16]
//                              // if IMM[2]  is set then XMM2[47:32]   -> XMM1[47:32]
//                              // if IMM[3]  is set then XMM2[63:48]   -> XMM1[63:48]
//                              // if IMM[4]  is set then XMM2[79:64]   -> XMM1[79:64]
//                              // if IMM[5]  is set then XMM2[95:80]   -> XMM1[95:80]
//                              // if IMM[6]  is set then XMM2[111:96]  -> XMM1[111:96]
//                              // if IMM[7]  is set then XMM2[127:112] -> XMM1[127:112]
//
//  XMM1 <-  XMM2  PBLENDW,  // if IMM[0]  is set then XMM2[15:0]    -> XMM1[15:0]
//                              // if IMM[1]  is set then XMM2[31:16]   -> XMM1[31:16]
//                              // if IMM[2]  is set then XMM2[47:32]   -> XMM1[47:32]
//                              // if IMM[3]  is set then XMM2[63:48]   -> XMM1[63:48]
//                              // if IMM[4]  is set then XMM2[79:64]   -> XMM1[79:64]
//                              // if IMM[5]  is set then XMM2[95:80]   -> XMM1[95:80]
//                              // if IMM[6]  is set then XMM2[111:96]  -> XMM1[111:96]
//                              // if IMM[7]  is set then XMM2[127:112] -> XMM1[127:112]
//
//  XMM8  XMM1  PBLENDW,        // if IMM[0]  is set then XMM8[15:0]    -> XMM1[15:0]
//                              // if IMM[1]  is set then XMM8[31:16]   -> XMM1[31:16]
//                              // if IMM[2]  is set then XMM8[47:32]   -> XMM1[47:32]
//                              // if IMM[3]  is set then XMM8[63:48]   -> XMM1[63:48]
//                              // if IMM[4]  is set then XMM8[79:64]   -> XMM1[79:64]
//                              // if IMM[5]  is set then XMM8[95:80]   -> XMM1[95:80]
//                              // if IMM[6]  is set then XMM8[111:96]  -> XMM1[111:96]
//                              // if IMM[7]  is set then XMM8[127:112] -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpclmulqdqcomma ( PCLMULQDQ, )
//
// C prototype:
//  void dg_forthpclmulqdqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 PCLMULQDQ instruction. This sequence performs a carry-less
//   multiplication between a 64 bit value from the source and a 64 bit value
//   from the destination. Bit 0 of the immediate value selects the low (0) or
//   high (1) 64 bit value of the destination. Bit 4 of the immediate value
//   selects the low (0) or high (1) 64 bit value of the source.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM1  PCLMULQDQ,    
//                    // PCMUL([RBX][63:0], XMM1[63:0])     -> XMM1[127:0]
//
//  01 N  RBX [R]  XMM1  PCLMULQDQ,    
//                    // PCMUL([RBX][63:0], XMM1[127:64])   -> XMM1[127:0]
//
//  10 N  RBX [R]  XMM1  PCLMULQDQ,    
//                    // PCMUL([RBX][127:64], XMM1[63:0])   -> XMM1[127:0]
//
//  11 N  RBX [R]  XMM1  PCLMULQDQ,    
//                    // PCMUL([RBX][127:64], XMM1[127:64]) -> XMM1[127:0]
//
//  01 N  XMM2  XMM1  PCLMULQDQ,       
//                    // PCMUL(XMM2[63:0], XMM1[127:64])     -> XMM1[127:0]
//
//  01 N  XMM1 <-  XMM2  PCLMULQDQ, 
//                    // PCMUL(XMM2[63:0], XMM1[127:64])     -> XMM1[127:0]
//
//  01 N  XMM8  XMM1  PCLMULQDQ,       
//                    // PCMUL(XMM8[63:0], XMM1[127:64])     -> XMM1[127:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpclmulqdqcomma ( VPCLMULQDQ, )
//
// C prototype:
//  void dg_forthvpclmulqdqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist 
//     targetzparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for target y can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targets x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls four targets from the data stack and compiles the opcode sequence for
//   an x86 VPCLMULQDQ instruction. This sequence performs a carry-less
//   multiplication between a 64 bit value from the source and a 64 bit value
//   from target y. Bit 0 of the immediate value selects the low (0) or
//   high (1) 64 bit value of target y. Bit 4 of the immediate value
//   selects the low (0) or high (1) 64 bit value of the source.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM0  XMM1  VPCLMULQDQ,    
//                   // PCMUL([RBX][63:0], XMM0[63:0])     -> XMM1[127:0]
//
//  01 N  RBX [R]  XMM0  XMM1  VPCLMULQDQ,    
//                   // PCMUL([RBX][63:0], XMM0[127:64])   -> XMM1[127:0]
//
//  10 N  RBX [R]  XMM0  XMM1  VPCLMULQDQ,    
//                   // PCMUL([RBX][127:64], XMM0[63:0])   -> XMM1[127:0]
//
//  11 N  RBX [R]  XMM0  XMM1  VPCLMULQDQ,    
//                   // PCMUL([RBX][127:64], XMM0[127:64]) -> XMM1[127:0]
//
//  01 N  XMM2  XMM0  XMM1  VPCLMULQDQ,       
//                   // PCMUL(XMM2[63:0], XMM0[127:64])     -> XMM1[127:0]
//
//  01 N  XMM1 <-  XMM0  XMM2  VPCLMULQDQ, 
//                   // PCMUL(XMM2[63:0], XMM0[127:64])     -> XMM1[127:0]
//
//  01 N  XMM8  XMM0  XMM1  VPCLMULQDQ,       
//                   // PCMUL(XMM8[63:0], XMM0[127:64])     -> XMM1[127:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y
//   must be an xmm register. The first target must be the immediate target. 
//   If you use -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpcmpestricomma ( PCMPESTRI, )
//
// C prototype:
//  void dg_forthpcmpestricomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 PCMPESTRI instruction. This sequence compares a string of values in
//   the source with a string of values in the destination using the rules
//   selected with the value of the immediate target. The flags are set in a
//   non standard way based on the results of the compares.
//   This sequence uses the absolute value of EAX(RAX) as the length of the
//   destination. This sequence uses the absolute value of EDX(RDX) as the
//   length of the source.
//   If IMM[6] is clear, the index of the first matching compare is returned in
//   ECX. Otherwise, the index of the last matching compare is returned in ECX.
//   If nothing matched, ECX equals 16 if you are comparing bytes, or 8 if you
//   are comparing 16 bit values.
//
//   Flag       meaning
//    CF         true if any of the compares matched
//    ZF         if comparing bytes, this is set if EDX < 16
//               if comparing 16 bit values, this is set if EDX < 8
//    SF         if comparing bytes, this is set if EAX < 16
//               if comparing 16 bit values, this is set if EAX < 8
//    OF         result of the first compare
//    AF         cleared
//    PF         cleared
//
//  IMM[0]      meaning
//   0           byte values are compared
//   1           16 bit values are compared
//
//  IMM[1]      meaning
//   0           unsigned values
//   1           signed values
//
//  IMM[3:2]    meaning
//   0           (find if values are in a set)
//               I think this one might be...
//               for each destination value:
//                does the destination value match any values in the source
//
//   1           (find if values are in a set of ranges)
//               I think this one might be...
//               for each destination value:
//                is a destination value >= each even indexed value
//                in source, and the destination value <= each odd indexed
//                value in source?
//
//   2           (string compare)
//               do all of dest[i] = src[i], if the strings are different lengths,
//                then the extra characters do not match.
//
//   3           (substring search)
//               I think this one might be...
//               for each destination index:
//                does source string match each destination string starting at index
//
//  IMM[5:4]   meaning
//   0          don't change the results of the compares
//   1          invert the results of the compares
//   2          don't change the results of the compares
//   3          invert the results of the compares unless
//               the source index is beyond the length of the source string
//
//  IMM[6]     meaning
//   0          ECX returns the index of the first true compare
//   1          ECX returns the index of the last true compare
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  10 N  EDX  MOV,                  // source length
//  10 N  EAX  MOV,                  // destination length
//  00 N  RBX [R]  XMM1  PCMPESTRI,  // ECX will have index of first destination byte
//                                   //  that matches any of the bytes in the source
//
//  40 N  RBX [R]  XMM1  PCMPESTRI,  // ECX will have index of last destination byte
//                                   //  that matches any of the bytes in the source
//
//  5 N  EDX  MOV,                   // source length
//  5 N  EAX  MOV,                   // destination length
//  08 N  RBX [R]  XMM1  PCMPESTRI,  // ECX will have index of first byte where
//                                   //  RBX[i] = XMM1[i] out of first 5 bytes
//                                   //  if none match in first 5, RCX = HEX 10
//
//  5 N  EDX  MOV,                   // source length
//  5 N  EAX  MOV,                   // destination length
//  08 N  XMM2  XMM1  PCMPESTRI,     // ECX will have index of first byte where
//                                   //  XMM2[i] = XMM1[i] out of first 5 bytes
//                                   //  if none match in first 5, RCX = HEX 10
//
//  5 N  EDX  MOV,                      // source length
//  5 N  EAX  MOV,                      // destination length
//  08 N  XMM1 <-  XMM2  PCMPESTRI,  // ECX will have index of first byte where
//                                      //  XMM2[i] = XMM1[i] out of first 5 bytes
//                                      //  if none match in first 5, RCX = HEX 10
//
//  5 N  EDX  MOV,                      // source length
//  5 N  EAX  MOV,                      // destination length
//  08 N  XMM8  XMM1  PCMPESTRI,        // ECX will have index of first byte where
//                                      //  XMM8[i] = XMM1[i] out of first 5 bytes
//                                      //  if none match in first 5, RCX = HEX 10
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpcmpestricomma ( VPCMPESTRI, )
//
// C prototype:
//  void dg_forthvpcmpestricomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for the target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPCMPESTRI instruction. This sequence compares a string of values in
//   the source with a string of values in the destination using the rules
//   selected with the value of the immediate target. The flags are set in a
//   non standard way based on the results of the compares.
//   This sequence uses the absolute value of EAX(RAX) as the length of the
//   destination. This sequence uses the absolute value of EDX(RDX) as the
//   length of the source.
//   If IMM[6] is clear, the index of the first matching compare is returned in
//   ECX. Otherwise, the index of the last matching compare is returned in ECX.
//   If nothing matched, ECX equals 16 if you are comparing bytes, or 8 if you
//   are comparing 16 bit values.
//
//   Flag       meaning
//    CF         true if any of the compares matched
//    ZF         if comparing bytes, this is set if EDX < 16
//               if comparing 16 bit values, this is set if EDX < 8
//    SF         if comparing bytes, this is set if EAX < 16
//               if comparing 16 bit values, this is set if EAX < 8
//    OF         result of the first compare
//    AF         cleared
//    PF         cleared
//
//  IMM[0]      meaning
//   0           byte values are compared
//   1           16 bit values are compared
//
//  IMM[1]      meaning
//   0           unsigned values
//   1           signed values
//
//  IMM[3:2]    meaning
//   0           (find if values are in a set)
//               I think this one might be...
//               for each destination value:
//                does the destination value match any values in the source
//
//   1           (find if values are in a set of ranges)
//               I think this one might be...
//               for each destination value:
//                is a destination value >= each even indexed value
//                in source, and the destination value <= each odd indexed
//                value in source?
//
//   2           (string compare)
//               do all of dest[i] = src[i], if the strings are different lengths,
//                then the extra characters do not match.
//
//   3           (substring search)
//               I think this one might be...
//               for each destination index:
//                does source string match each destination string starting at index
//
//  IMM[5:4]   meaning
//   0          don't change the results of the compares
//   1          invert the results of the compares
//   2          don't change the results of the compares
//   3          invert the results of the compares unless
//               the source index is beyond the length of the source string
//
//  IMM[6]     meaning
//   0          ECX returns the index of the first true compare
//   1          ECX returns the index of the last true compare
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  10 N  EDX  MOV,                  // source length
//  10 N  EAX  MOV,                  // destination length
//  00 N  RBX [R]  XMM1  VPCMPESTRI, // ECX will have index of first destination byte
//                                   //  that matches any of the bytes in the source
//
//  40 N  RBX [R]  XMM1  VPCMPESTRI, // ECX will have index of last destination byte
//                                   //  that matches any of the bytes in the source
//
//  5 N  EDX  MOV,                   // source length
//  5 N  EAX  MOV,                   // destination length
//  08 N  RBX [R]  XMM1  VPCMPESTRI, // ECX will have index of first byte where
//                                   //  RBX[i] = XMM1[i] out of first 5 bytes
//                                   //  if none match in first 5, RCX = HEX 10
//
//  5 N  EDX  MOV,                   // source length
//  5 N  EAX  MOV,                   // destination length
//  08 N  XMM2  XMM1  VPCMPESTRI,    // ECX will have index of first byte where
//                                   //  XMM2[i] = XMM1[i] out of first 5 bytes
//                                   //  if none match in first 5, RCX = HEX 10
//
//  5 N  EDX  MOV,                      // source length
//  5 N  EAX  MOV,                      // destination length
//  08 N  XMM1 <-  XMM2  VPCMPESTRI, // ECX will have index of first byte where
//                                      //  XMM2[i] = XMM1[i] out of first 5 bytes
//                                      //  if none match in first 5, RCX = HEX 10
//
//  5 N  EDX  MOV,                      // source length
//  5 N  EAX  MOV,                      // destination length
//  08 N  XMM8  XMM1  VPCMPESTRI,       // ECX will have index of first byte where
//                                      //  XMM8[i] = XMM1[i] out of first 5 bytes
//                                      //  if none match in first 5, RCX = HEX 10
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpcmpestrmcomma ( PCMPESTRM, )
//
// C prototype:
//  void dg_forthpcmpestrmcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 PCMPESTRM instruction. This sequence compares a string of values in
//   the source with a string of values in the destination using the rules
//   selected with the value of the immediate target. The flags are set in a
//   non standard way based on the results of the compares.
//   This sequence uses the absolute value of EAX(RAX) as the length of the
//   destination. This sequence uses the absolute value of EDX(RDX) as the
//   length of the source.
//   The result of each compare is returned in XMM0.
//
//   Flag       meaning
//    CF         true if any of the compares matched
//    ZF         if comparing bytes, this is set if EDX < 16
//               if comparing 16 bit values, this is set if EDX < 8
//    SF         if comparing bytes, this is set if EAX < 16
//               if comparing 16 bit values, this is set if EAX < 8
//    OF         result of the first compare
//    AF         cleared
//    PF         cleared
//
//  IMM[0]      meaning
//   0           byte values are compared
//   1           16 bit values are compared
//
//  IMM[1]      meaning
//   0           unsigned values
//   1           signed values
//
//  IMM[3:2]    meaning
//   0           (find if values are in a set)
//               I think this one might be...
//               for each destination value:
//                does the destination value match any values in the source
//
//   1           (find if values are in a set of ranges)
//               I think this one might be...
//               for each destination value:
//                is a destination value >= each even indexed value
//                in source, and the destination value <= each odd indexed
//                value in source?
//
//   2           (string compare)
//               do all of dest[i] = src[i], if the strings are different lengths,
//                then the extra characters do not match.
//
//   3           (substring search)
//               I think this one might be...
//               for each destination index:
//                does source string match each destination string starting at index
//
//  IMM[5:4]   meaning
//   0          don't change the results of the compares
//   1          invert the results of the compares
//   2          don't change the results of the compares
//   3          invert the results of the compares unless
//               the source index is beyond the length of the source string
//
//  IMM[6]     meaning
//   0          XMM0 returns the results of each compare as a bit.
//               (E.g. the result of the first compare is in bit 0.
//               The unused upper bits of XMM0 are cleared.)
//   1          XMM0 returns the results of each compare as a byte.
//               (E.g. the result of the first compare is in byte 0. If the first
//               compare was false, byte 0 = 0, otherwise byte 0 = HEX FF)
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  10 N  EDX  MOV,                  // source length
//  10 N  EAX  MOV,                  // destination length
//  00 N  RBX [R]  XMM1  PCMPESTRM,  // XMM0 will have bit mask of which destination
//                                   //  bytes match any of the source bytes
//
//  40 N  RBX [R]  XMM1  PCMPESTRM,  // XMM0 will have byte mask of which destination
//                                   //  bytes match any of the source bytes
//
//  5 N  EDX  MOV,                   // source length
//  5 N  EAX  MOV,                   // destination length
//  08 N  RBX [R]  XMM1  PCMPESTRM,  // XMM0 will have bits set of when [RBX][i] = XMM1[i]
//                                   //  out of the first 5 bytes.
//
//
//  5 N  EDX  MOV,                   // source length
//  5 N  EAX  MOV,                   // destination length
//  08 N  XMM2  XMM1  PCMPESTRM,     // XMM0 will have bits set of when XMM2[i] = XMM1[i]
//                                   //  out of the first 5 bytes.
//
//  5 N  EDX  MOV,                      // source length
//  5 N  EAX  MOV,                      // destination length
//  08 N  XMM1 <-  XMM2  PCMPESTRM,  // XMM0 will have bits set of when XMM2[i] = XMM1[i]
//                                      //  out of the first 5 bytes.
//
//  5 N  EDX  MOV,                      // source length
//  5 N  EAX  MOV,                      // destination length
//  08 N  XMM8  XMM1  PCMPESTRM,        // XMM0 will have bits set of when XMM8[i] = XMM1[i]
//                                      //  out of the first 5 bytes.
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpcmpestrmcomma ( VPCMPESTRM, )
//
// C prototype:
//  void dg_forthvpcmpestrmcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPCMPESTRM instruction. This sequence compares a string of values in
//   the source with a string of values in the destination using the rules
//   selected with the value of the immediate target. The flags are set in a
//   non standard way based on the results of the compares.
//   This sequence uses the absolute value of EAX(RAX) as the length of the
//   destination. This sequence uses the absolute value of EDX(RDX) as the
//   length of the source.
//   The result of each compare is returned in XMM0.
//
//   Flag       meaning
//    CF         true if any of the compares matched
//    ZF         if comparing bytes, this is set if EDX < 16
//               if comparing 16 bit values, this is set if EDX < 8
//    SF         if comparing bytes, this is set if EAX < 16
//               if comparing 16 bit values, this is set if EAX < 8
//    OF         result of the first compare
//    AF         cleared
//    PF         cleared
//
//  IMM[0]      meaning
//   0           byte values are compared
//   1           16 bit values are compared
//
//  IMM[1]      meaning
//   0           unsigned values
//   1           signed values
//
//  IMM[3:2]    meaning
//   0           (find if values are in a set)
//               I think this one might be...
//               for each destination value:
//                does the destination value match any values in the source
//
//   1           (find if values are in a set of ranges)
//               I think this one might be...
//               for each destination value:
//                is a destination value >= each even indexed value
//                in source, and the destination value <= each odd indexed
//                value in source?
//
//   2           (string compare)
//               do all of dest[i] = src[i], if the strings are different lengths,
//                then the extra characters do not match.
//
//   3           (substring search)
//               I think this one might be...
//               for each destination index:
//                does source string match each destination string starting at index
//
//  IMM[5:4]   meaning
//   0          don't change the results of the compares
//   1          invert the results of the compares
//   2          don't change the results of the compares
//   3          invert the results of the compares unless
//               the source index is beyond the length of the source string
//
//  IMM[6]     meaning
//   0          XMM0 returns the results of each compare as a bit.
//               (E.g. the result of the first compare is in bit 0.
//               The unused upper bits of XMM0 are cleared.)
//   1          XMM0 returns the results of each compare as a byte.
//               (E.g. the result of the first compare is in byte 0. If the first
//               compare was false, byte 0 = 0, otherwise byte 0 = HEX FF)
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  10 N  EDX  MOV,                  // source length
//  10 N  EAX  MOV,                  // destination length
//  00 N  RBX [R]  XMM1  VPCMPESTRM, // XMM0 will have bit mask of which destination
//                                   //  bytes match any of the source bytes
//
//  40 N  RBX [R]  XMM1  VPCMPESTRM, // XMM0 will have byte mask of which destination
//                                   //  bytes match any of the source bytes
//
//  5 N  EDX  MOV,                   // source length
//  5 N  EAX  MOV,                   // destination length
//  08 N  RBX [R]  XMM1  VPCMPESTRM, // XMM0 will have bits set of when [RBX][i] = XMM1[i]
//                                   //  out of the first 5 bytes.
//
//
//  5 N  EDX  MOV,                   // source length
//  5 N  EAX  MOV,                   // destination length
//  08 N  XMM2  XMM1  VPCMPESTRM,    // XMM0 will have bits set of when XMM2[i] = XMM1[i]
//                                   //  out of the first 5 bytes.
//
//  5 N  EDX  MOV,                      // source length
//  5 N  EAX  MOV,                      // destination length
//  08 N  XMM1 <-  XMM2  VPCMPESTRM, // XMM0 will have bits set of when XMM2[i] = XMM1[i]
//                                      //  out of the first 5 bytes.
//
//  5 N  EDX  MOV,                      // source length
//  5 N  EAX  MOV,                      // destination length
//  08 N  XMM8  XMM1  VPCMPESTRM,       // XMM0 will have bits set of when XMM8[i] = XMM1[i]
//                                      //  out of the first 5 bytes.
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpcmpistricomma ( PCMPISTRI, )
//
// C prototype:
//  void dg_forthpcmpistricomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 PCMPISTRI instruction. This sequence compares a string of values in
//   the source with a string of values in the destination using the rules
//   selected with the value of the immediate target. Any characters after
//   and including the first null (0) in the source or destination are considered
//   invalid and do not match. The flags are set in a non standard way based on
//   the results of the compares.
//   Unlike PCMPESTRI, this sequence does not use EAX(RAX) or EDX(RDX) as
//   string lengths. The length of each string is implicit: it ends at the
//   first null (0) value, or at 16 bytes (8 16 bit values) if there is none.
//   If IMM[6] is clear, the index of the first matching compare is returned in
//   ECX. Otherwise, the index of the last matching compare is returned in ECX.
//   If nothing matched, ECX equals 16 if you are comparing bytes, or 8 if you
//   are comparing 16 bit values.
//
//   Flag       meaning
//    CF         true if any of the compares matched
//    ZF         set if any value in the source is null (0)
//    SF         set if any value in the destination is null (0)
//    OF         result of the first compare
//    AF         cleared
//    PF         cleared
//
//  IMM[0]      meaning
//   0           byte values are compared
//   1           16 bit values are compared
//
//  IMM[1]      meaning
//   0           unsigned values
//   1           signed values
//
//  IMM[3:2]    meaning
//   0           (find if values are in a set)
//               I think this one might be...
//               for each destination value:
//                does the destination value match any values in the source
//
//   1           (find if values are in a set of ranges)
//               I think this one might be...
//               for each destination value:
//                is a destination value >= each even indexed value
//                in source, and the destination value <= each odd indexed
//                value in source?
//
//   2           (string compare)
//               do all of dest[i] = src[i], if the strings are different lengths,
//                then the extra characters do not match.
//
//   3           (substring search)
//               I think this one might be...
//               for each destination index:
//                does source string match each destination string starting at index
//
//  IMM[5:4]   meaning
//   0          don't change the results of the compares
//   1          invert the results of the compares
//   2          don't change the results of the compares
//   3          invert the results of the compares unless
//               the source index is beyond the length of the source string
//
//  IMM[6]     meaning
//   0          ECX returns the index of the first true compare
//   1          ECX returns the index of the last true compare
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  10 N  EDX  MOV,                  // source length
//  10 N  EAX  MOV,                  // destination length
//  00 N  RBX [R]  XMM1  PCMPISTRI,  // ECX will have index of first destination byte
//                                   //  before a null byte that matches any of
//                                   //  the bytes in the source before a
//                                   //  null byte
//
//  40 N  RBX [R]  XMM1  PCMPISTRI,  // ECX will have index of last destination byte
//                                   //  that matches any of the bytes in the
//                                   //  source before any null bytes
//
//  5 N  EDX  MOV,                   // source length
//  5 N  EAX  MOV,                   // destination length
//  08 N  RBX [R]  XMM1  PCMPISTRI,  // ECX will have index of first byte where
//                                   //  RBX[i] = XMM1[i] out of first 5 bytes
//                                   //  before a null byte is encountered
//                                   //  if none match in first 5, RCX = HEX 10
//
//  5 N  EDX  MOV,                   // source length
//  5 N  EAX  MOV,                   // destination length
//  08 N  XMM2  XMM1  PCMPISTRI,     // ECX will have index of first byte where
//                                   //  XMM2[i] = XMM1[i] out of first 5 bytes
//                                   //  before a null byte is encountered
//                                   //  if none match in first 5, RCX = HEX 10
//
//  5 N  EDX  MOV,                      // source length
//  5 N  EAX  MOV,                      // destination length
//  08 N  XMM1 <-  XMM2  PCMPISTRI,  // ECX will have index of first byte where
//                                      //  XMM2[i] = XMM1[i] out of first 5 bytes
//                                      //  before a null byte is encountered
//                                      //  if none match in first 5, RCX = HEX 10
//
//  5 N  EDX  MOV,                      // source length
//  5 N  EAX  MOV,                      // destination length
//  08 N  XMM8  XMM1  PCMPISTRI,        // ECX will have index of first byte where
//                                      //  XMM8[i] = XMM1[i] out of first 5 bytes
//                                      //  before a null byte is encountered
//                                      //  if none match in first 5, RCX = HEX 10
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpcmpistricomma ( VPCMPISTRI, )
//
// C prototype:
//  void dg_forthvpcmpistricomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPCMPISTRI instruction. This sequence compares a string of values in
//   the source with a string of values in the destination using the rules
//   selected with the value of the immediate target. Any characters after
//   and including the first null (0) in the source or destination are considered
//   invalid and do not match. The flags are set in a non standard way based on
//   the results of the compares.
//   Unlike VPCMPESTRI, this sequence does not use EAX(RAX) or EDX(RDX) as
//   string lengths. The length of each string is implicit: it ends at the
//   first null (0) value, or at 16 bytes (8 16 bit values) if there is none.
//   If IMM[6] is clear, the index of the first matching compare is returned in
//   ECX. Otherwise, the index of the last matching compare is returned in ECX.
//   If nothing matched, ECX equals 16 if you are comparing bytes, or 8 if you
//   are comparing 16 bit values.
//
//   Flag       meaning
//    CF         true if any of the compares matched
//    ZF         set if any value in the source is null (0)
//    SF         set if any value in the destination is null (0)
//    OF         result of the first compare
//    AF         cleared
//    PF         cleared
//
//  IMM[0]      meaning
//   0           byte values are compared
//   1           16 bit values are compared
//
//  IMM[1]      meaning
//   0           unsigned values
//   1           signed values
//
//  IMM[3:2]    meaning
//   0           (find if values are in a set)
//               I think this one might be...
//               for each destination value:
//                does the destination value match any values in the source
//
//   1           (find if values are in a set of ranges)
//               I think this one might be...
//               for each destination value:
//                is a destination value >= each even indexed value
//                in source, and the destination value <= each odd indexed
//                value in source?
//
//   2           (string compare)
//               do all of dest[i] = src[i], if the strings are different lengths,
//                then the extra characters do not match.
//
//   3           (substring search)
//               I think this one might be...
//               for each destination index:
//                does source string match each destination string starting at index
//
//  IMM[5:4]   meaning
//   0          don't change the results of the compares
//   1          invert the results of the compares
//   2          don't change the results of the compares
//   3          invert the results of the compares unless
//               the source index is beyond the length of the source string
//
//  IMM[6]     meaning
//   0          ECX returns the index of the first true compare
//   1          ECX returns the index of the last true compare
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  10 N  EDX  MOV,                  // source length
//  10 N  EAX  MOV,                  // destination length
//  00 N  RBX [R]  XMM1  VPCMPISTRI, // ECX will have index of first destination byte
//                                   //  before a null byte that matches any of
//                                   //  the bytes in the source before a
//                                   //  null byte
//
//  40 N  RBX [R]  XMM1  VPCMPISTRI, // ECX will have index of last destination byte
//                                   //  that matches any of the bytes in the
//                                   //  source before any null bytes
//
//  5 N  EDX  MOV,                   // source length
//  5 N  EAX  MOV,                   // destination length
//  08 N  RBX [R]  XMM1  VPCMPISTRI, // ECX will have index of first byte where
//                                   //  RBX[i] = XMM1[i] out of first 5 bytes
//                                   //  before a null byte is encountered
//                                   //  if none match in first 5, RCX = HEX 10
//
//  5 N  EDX  MOV,                   // source length
//  5 N  EAX  MOV,                   // destination length
//  08 N  XMM2  XMM1  VPCMPISTRI,    // ECX will have index of first byte where
//                                   //  XMM2[i] = XMM1[i] out of first 5 bytes
//                                   //  before a null byte is encountered
//                                   //  if none match in first 5, RCX = HEX 10
//
//  5 N  EDX  MOV,                      // source length
//  5 N  EAX  MOV,                      // destination length
//  08 N  XMM1 <-  XMM2  VPCMPISTRI, // ECX will have index of first byte where
//                                      //  XMM2[i] = XMM1[i] out of first 5 bytes
//                                      //  before a null byte is encountered
//                                      //  if none match in first 5, RCX = HEX 10
//
//  5 N  EDX  MOV,                      // source length
//  5 N  EAX  MOV,                      // destination length
//  08 N  XMM8  XMM1  VPCMPISTRI,       // ECX will have index of first byte where
//                                      //  XMM8[i] = XMM1[i] out of first 5 bytes
//                                      //  before a null byte is encountered
//                                      //  if none match in first 5, RCX = HEX 10
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpcmpistrmcomma ( PCMPISTRM, )
//
// C prototype:
//  void dg_forthpcmpistrmcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for the immediate target can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 PCMPISTRM instruction. This sequence compares a string of values in
//   the source with a string of values in the destination using the rules
//   selected with the value of the immediate target. Any characters after
//   and including the first null (0) in the source or destination are considered
//   invalid and do not match. The flags are set in a non standard way based on
//   the results of the compares.
//   This sequence uses the absolute value of EAX(RAX) as the length of the
//   destination. This sequence uses the absolute value of EDX(RDX) as the
//   length of the source.
//   The result of each compare is returned in XMM0.
//
//   Flag       meaning
//    CF         true if any of the compares matched
//    ZF         if comparing bytes, this is set if EDX < 16
//               if comparing 16 bit values, this is set if EDX < 8
//    SF         if comparing bytes, this is set if EAX < 16
//               if comparing 16 bit values, this is set if EAX < 8
//    OF         result of the first compare
//    AF         cleared
//    PF         cleared
//
//  IMM[0]      meaning
//   0           byte values are compared
//   1           16 bit values are compared
//
//  IMM[1]      meaning
//   0           unsigned values
//   1           signed values
//
//  IMM[3:2]    meaning
//   0           (find if values are in a set)
//               I think this one might be...
//               for each destination value:
//                does the destination value match any values in the source
//
//   1           (find if values are in a set of ranges)
//               I think this one might be...
//               for each destination value:
//                is a destination value >= each even indexed value
//                in source, and the destination value <= each odd indexed
//                value in source?
//
//   2           (string compare)
//               do all of dest[i] = src[i], if the strings are different lengths,
//                then the extra characters do not match.
//
//   3           (substring search)
//               I think this one might be...
//               for each destination index:
//                does source string match each destination string starting at index
//
//  IMM[5:4]   meaning
//   0          don't change the results of the compares
//   1          invert the results of the compares
//   2          don't change the results of the compares
//   3          invert the results of the compares unless
//               the source index is beyond the length of the source string
//
//  IMM[6]     meaning
//   0          XMM0 returns the results of each compare as a bit.
//               (E.g. the result of the first compare is in bit 0.
//               The unused upper bits of XMM0 are cleared.)
//   1          XMM0 returns the results of each compare as a byte.
//               (E.g. the result of the first compare is in byte 0. If the first
//               compare was false, byte 0 = 0, otherwise byte 0 = HEX FF)
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  10 N  EDX  MOV,                  // source length
//  10 N  EAX  MOV,                  // destination length
//  00 N  RBX [R]  XMM1  PCMPISTRM,  // XMM0 will have bit mask of which destination
//                                   //  bytes before the first null byte
//                                   //  match any of the source bytes
//                                   //  before the first null byte
//
//  40 N  RBX [R]  XMM1  PCMPISTRM,  // XMM0 will have byte mask of which destination
//                                   //  bytes before the first null byte
//                                   //  match any of the source bytes
//                                   //  before the first null byte
//
//  5 N  EDX  MOV,                   // source length
//  5 N  EAX  MOV,                   // destination length
//  08 N  RBX [R]  XMM1  PCMPISTRM,  // XMM0 will have bits set of when [RBX][i] = XMM1[i]
//                                   //  out of the first 5 bytes before any null bytes.
//
//
//  5 N  EDX  MOV,                   // source length
//  5 N  EAX  MOV,                   // destination length
//  08 N  XMM2  XMM1  PCMPISTRM,     // XMM0 will have bits set where XMM2[i] = XMM1[i]
//                                   //  out of the first 5 bytes before any null bytes.
//
//  5 N  EDX  MOV,                      // source length
//  5 N  EAX  MOV,                      // destination length
//  08 N  XMM1 <-  XMM2  PCMPISTRM,     // XMM0 will have bits set where XMM2[i] = XMM1[i]
//                                      //  out of the first 5 bytes before any null bytes.
//
//  5 N  EDX  MOV,                      // source length
//  5 N  EAX  MOV,                      // destination length
//  08 N  XMM8  XMM1  PCMPISTRM,        // XMM0 will have bits set where XMM8[i] = XMM1[i]
//                                      //  out of the first 5 bytes before any null bytes.
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpcmpistrmcomma ( VPCMPISTRM, )
//
// C prototype:
//  void dg_forthvpcmpistrmcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for the immediate target can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPCMPISTRM instruction. This sequence compares a string of values in
//   the source with a string of values in the destination using the rules
//   selected with the value of the immediate target. Any characters after
//   and including the first null (0) in the source or destination are considered
//   invalid and do not match. The flags are set in a non standard way based on
//   the results of the compares.
//   This sequence uses the absolute value of EAX(RAX) as the length of the
//   destination. This sequence uses the absolute value of EDX(RDX) as the
//   length of the source.
//   The result of each compare is returned in XMM0.
//
//   Flag       meaning
//    CF         true if any of the compares matched
//    ZF         if comparing bytes, this is set if EDX < 16
//               if comparing 16 bit values, this is set if EDX < 8
//    SF         if comparing bytes, this is set if EAX < 16
//               if comparing 16 bit values, this is set if EAX < 8
//    OF         result of the first compare
//    AF         cleared
//    PF         cleared
//
//  IMM[0]      meaning
//   0           byte values are compared
//   1           16 bit values are compared
//
//  IMM[1]      meaning
//   0           unsigned values
//   1           signed values
//
//  IMM[3:2]    meaning
//   0           (find if values are in a set)
//               I think this one might be...
//               for each destination value:
//                does the destination value match any values in the source
//
//   1           (find if values are in a set of ranges)
//               I think this one might be...
//               for each destination value:
//                is a destination value >= each even indexed value
//                in source, and the destination value <= each odd indexed
//                value in source?
//
//   2           (string compare)
//               do all of dest[i] = src[i], if the strings are different lengths,
//                then the extra characters do not match.
//
//   3           (substring search)
//               I think this one might be...
//               for each destination index:
//                does source string match each destination string starting at index
//
//  IMM[5:4]   meaning
//   0          don't change the results of the compares
//   1          invert the results of the compares
//   2          don't change the results of the compares
//   3          invert the results of the compares unless
//               the source index is beyond the length of the source string
//
//  IMM[6]     meaning
//   0          XMM0 returns the results of each compare as a bit.
//               (E.g. the result of the first compare is in bit 0.
//               The unused upper bits of XMM0 are cleared.)
//   1          XMM0 returns the results of each compare as a byte.
//               (E.g. the result of the first compare is in byte 0. If the first
//               compare was false, byte 0 = 0, otherwise byte 0 = HEX FF)
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  10 N  EDX  MOV,                  // source length
//  10 N  EAX  MOV,                  // destination length
//  00 N  RBX [R]  XMM1  VPCMPISTRM, // XMM0 will have bit mask of which destination
//                                   //  bytes before the first null byte
//                                   //  match any of the source bytes
//                                   //  before the first null byte
//
//  40 N  RBX [R]  XMM1  VPCMPISTRM, // XMM0 will have byte mask of which destination
//                                   //  bytes before the first null byte
//                                   //  match any of the source bytes
//                                   //  before the first null byte
//
//  5 N  EDX  MOV,                   // source length
//  5 N  EAX  MOV,                   // destination length
//  08 N  RBX [R]  XMM1  VPCMPISTRM, // XMM0 will have bits set of when [RBX][i] = XMM1[i]
//                                   //  out of the first 5 bytes before any null bytes.
//
//
//  5 N  EDX  MOV,                   // source length
//  5 N  EAX  MOV,                   // destination length
//  08 N  XMM2  XMM1  VPCMPISTRM,    // XMM0 will have bits set where XMM2[i] = XMM1[i]
//                                   //  out of the first 5 bytes before any null bytes.
//
//  5 N  EDX  MOV,                      // source length
//  5 N  EAX  MOV,                      // destination length
//  08 N  XMM1 <-  XMM2  VPCMPISTRM,    // XMM0 will have bits set where XMM2[i] = XMM1[i]
//                                      //  out of the first 5 bytes before any null bytes.
//
//  5 N  EDX  MOV,                      // source length
//  5 N  EAX  MOV,                      // destination length
//  08 N  XMM8  XMM1  VPCMPISTRM,       // XMM0 will have bits set where XMM8[i] = XMM1[i]
//                                      //  out of the first 5 bytes before any null bytes.
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthoutsbcomma ( OUTSB, )
//
// C prototype:
//  void dg_forthoutsbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array
//                                 where the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode byte for the x86 OUTSB instruction.
//  In 32 bit addressing mode:
//   Copies an 8 bit value to the i/o port specified by the DX register
//   from [ESI]. With a REP, prefix, ECX is decremented for each value copied.
//   ESI is adjusted according to the direction flag,
//    clear increments and set decrements.
//  In 64 bit addressing mode:
//   Copies an 8 bit value to the i/o port specified by the DX register
//   from [RSI]. With a REP, prefix, RCX is decremented for each value copied.
//   RSI is adjusted according to the direction flag,
//    clear increments and set decrements.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Note:
//   Some operating systems require the direction bit to
//   be left to forward (clear) when passing control to operating system
//   routines.
//   I usually push the flag register when entering and pop it when exiting
//   subroutines to cover this requirement.
//   Docs say you can use the 0x67 prefix with this opcode to force this
//   instruction to use a smaller version of RSI/ESI. If you use the 0x67
//   prefix in 64 bit address mode, this opcode only uses ESI for the
//   address. If you use the 0x67 prefix in 32 bit address mode, this
//   opcode only uses SI for the address. Not sure how useful this is...
//
// 32 bit addressing mode example:
//  srcaddress N  ESI  MOV,
//  count N  ECX  MOV,
//  REP,  OUTSB,  // sends count bytes at srcaddress to i/o port
//
// 64 bit addressing mode example:
//  srcaddress N  RSI  MOV,
//  count N  RCX  MOV,
//  REP,  OUTSB,  // sends count bytes at srcaddress to i/o port
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthoutsdcomma ( OUTSD, )
//
// C prototype:
//  void dg_forthoutsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array
//                                 where the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode byte for the x86 OUTSD instruction.
//  In 32 bit addressing mode:
//   Copies a 32 bit value to the i/o port specified by the DX register
//   from [ESI]. With a REP, prefix, ECX is decremented for each value copied.
//   ESI is adjusted according to the direction flag,
//    clear increments and set decrements.
//  In 64 bit addressing mode:
//   Copies a 32 bit value to the i/o port specified by the DX register
//   from [RSI]. With a REP, prefix, RCX is decremented for each value copied.
//   RSI is adjusted according to the direction flag,
//    clear increments and set decrements.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Note:
//   Some operating systems require the direction bit to
//   be left to forward (clear) when passing control to operating system
//   routines.
//   I usually push the flag register when entering and pop it when exiting
//   subroutines to cover this requirement.
//   Docs say you can use the 0x67 prefix with this opcode to force this
//   instruction to use a smaller version of RSI/ESI. If you use the 0x67
//   prefix in 64 bit address mode, this opcode only uses ESI for the
//   address. If you use the 0x67 prefix in 32 bit address mode, this
//   opcode only uses SI for the address. Not sure how useful this is...
//
// 32 bit addressing mode example:
//  srcaddress N  ESI  MOV,
//  count N  ECX  MOV,
//  REP,  OUTSD,  // sends count 32 bit values at srcaddress to i/o port
//
// 64 bit addressing mode example:
//  srcaddress N  RSI  MOV,
//  count N  RCX  MOV,
//  REP,  OUTSD,  // sends count 32 bit values at srcaddress to i/o port
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthoutswcomma ( OUTSW, )
//
// C prototype:
//  void dg_forthoutswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array
//                                 where the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode byte for the x86 OUTSW instruction.
//  In 32 bit addressing mode:
//   Copies a 16 bit value to the i/o port specified by the DX register
//   from [ESI]. When used with the REP, prefix, also decrements ECX.
//   ESI is adjusted according to the direction flag,
//    clear increments and set decrements.
//  In 64 bit addressing mode:
//   Copies a 16 bit value to the i/o port specified by the DX register
//   from [RSI]. When used with the REP, prefix, also decrements RCX.
//   RSI is adjusted according to the direction flag,
//    clear increments and set decrements.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Note:
//   Some operating systems require the direction bit to
//   be left set to forward (clear) when passing control to operating system
//   routines.
//   I usually push the flag register when entering and pop it when exiting
//   subroutines to cover this requirement.
//   Docs say you can use the 0x67 prefix with this opcode to force this
//   instruction to use a smaller version of RSI/ESI. If you use the 0x67
//   prefix in 64 bit address mode, this opcode only uses ESI for the
//   address. If you use the 0x67 prefix in 32 bit address mode, this
//   opcode only uses SI for the address. Not sure how useful this is...
//
// 32 bit addressing mode example:
//  srcaddress N  ESI  MOV,
//  count N  ECX  MOV,
//  REP,  OUTSW,  // sends count 16 bit values at srcaddress to i/o port
//
// 64 bit addressing mode example:
//  srcaddress N  RSI  MOV,
//  count N  RCX  MOV,
//  REP,  OUTSW,  // sends count 16 bit values at srcaddress to i/o port
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpshufdcomma ( PSHUFD, )
//
// C prototype:
//  void dg_forthpshufdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for the immediate target can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 PSHUFD instruction. This opcode sequence copies 32 bit values from
//   the source to the destination. Each pair of bits in the 8 bit immediate
//   value determines which source value to copy into a destination value.
//   immediatevalue[1:0] chooses source for dest[31:0]
//   immediatevalue[3:2] chooses source for dest[63:32]
//   immediatevalue[5:4] chooses source for dest[95:64]
//   immediatevalue[7:6] chooses source for dest[127:96]
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM0  PSHUFD,   // [RBX][31:0] -> XMM0[31:0]
//                                 // [RBX][31:0] -> XMM0[63:32]
//                                 // [RBX][31:0] -> XMM0[95:64]
//                                 // [RBX][31:0] -> XMM0[127:96]
//
//  E4 N  RBX [R]  XMM0  PSHUFD,   // [RBX][127:0] -> XMM0[127:0]
//
//  FF N  RBX [R]  XMM0  PSHUFD,   // [RBX][127:96] -> XMM0[31:0]
//                                 // [RBX][127:96] -> XMM0[63:32]
//                                 // [RBX][127:96] -> XMM0[95:64]
//                                 // [RBX][127:96] -> XMM0[127:96]
//
//  02 N  XMM2  XMM0  PSHUFD,      // XMM2[95:64] -> XMM0[31:0]
//                                 // XMM2[31:0]  -> XMM0[63:32]
//                                 // XMM2[31:0]  -> XMM0[95:64]
//                                 // XMM2[31:0]  -> XMM0[127:96]
//
//  02 N  XMM0 <- XMM2  PSHUFD, // XMM2[95:64] -> XMM0[31:0]
//                                 // XMM2[31:0]  -> XMM0[63:32]
//                                 // XMM2[31:0]  -> XMM0[95:64]
//                                 // XMM2[31:0]  -> XMM0[127:96]
//
//  02 N  XMM8  XMM0 PSHUFD,       // XMM8[95:64] -> XMM0[31:0]
//                                 // XMM8[31:0]  -> XMM0[63:32]
//                                 // XMM8[31:0]  -> XMM0[95:64]
//                                 // XMM8[31:0]  -> XMM0[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpshufdcomma ( VPSHUFD, )
//
// C prototype:
//  void dg_forthvpshufdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for the immediate target can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSHUFD instruction. This opcode sequence copies 32 bit values from
//   the source to the destination. Each pair of bits in the 8 bit immediate
//   value determines which source value to copy from both the lower and upper
//   128 bit sections of the source into the corresponding lower and upper
//   128 bit sections of the destination.
//   immediatevalue[1:0] chooses source for dest[31:0] and dest[159:128]
//   immediatevalue[3:2] chooses source for dest[63:32] and dest[191:160]
//   immediatevalue[5:4] chooses source for dest[95:64] and dest[223:192]
//   immediatevalue[7:6] chooses source for dest[127:96] and dest[255:224]
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM0  VPSHUFD,  // [RBX][31:0] -> XMM0[31:0]
//                                 // [RBX][31:0] -> XMM0[63:32]
//                                 // [RBX][31:0] -> XMM0[95:64]
//                                 // [RBX][31:0] -> XMM0[127:96]
//
//  E4 N  RBX [R]  XMM0  VPSHUFD,  // [RBX][127:0] -> XMM0[127:0]
//
//  FF N  RBX [R]  XMM0  VPSHUFD,  // [RBX][127:96] -> XMM0[31:0]
//                                 // [RBX][127:96] -> XMM0[63:32]
//                                 // [RBX][127:96] -> XMM0[95:64]
//                                 // [RBX][127:96] -> XMM0[127:96]
//
//  02 N  XMM2  XMM0  VPSHUFD,     // XMM2[95:64] -> XMM0[31:0]
//                                 // XMM2[31:0]  -> XMM0[63:32]
//                                 // XMM2[31:0]  -> XMM0[95:64]
//                                 // XMM2[31:0]  -> XMM0[127:96]
//
//  02 N  XMM0 <- XMM2  VPSHUFD, // XMM2[95:64] -> XMM0[31:0]
//                                  // XMM2[31:0]  -> XMM0[63:32]
//                                  // XMM2[31:0]  -> XMM0[95:64]
//                                  // XMM2[31:0]  -> XMM0[127:96]
//
//  02 N  YMM8  YMM0 VPSHUFD,      // YMM8[95:64]   -> YMM0[31:0]
//                                 // YMM8[31:0]    -> YMM0[63:32]
//                                 // YMM8[31:0]    -> YMM0[95:64]
//                                 // YMM8[31:0]    -> YMM0[127:96]
//                                 // YMM8[223:192] -> YMM0[159:128]
//                                 // YMM8[159:128] -> YMM0[191:160]
//                                 // YMM8[159:128] -> YMM0[223:192]
//                                 // YMM8[159:128] -> YMM0[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register. The first target must be the immediate target. If you use
//   -> it must come after a memory, xmm, or ymm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpshufhwcomma ( PSHUFHW, )
//
// C prototype:
//  void dg_forthpshufhwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( immediatetargetparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  immediatetargetparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for the immediate target can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 PSHUFHW instruction. This opcode sequence copies 16 bit values from
//   the high 64 bits of the source to the destination. Each pair of bits in the
//   8 bit immediate value determines which source value to copy into a
//   destination value.
//   immediatevalue[1:0] chooses source for dest[79:64]
//   immediatevalue[3:2] chooses source for dest[95:80]
//   immediatevalue[5:4] chooses source for dest[111:96]
//   immediatevalue[7:6] chooses source for dest[127:112]
//   The low 64 bits of the source are also copied to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM0  PSHUFHW,   // [RBX][79:64] -> XMM0[79:64]
//                                  // [RBX][79:64] -> XMM0[95:80]
//                                  // [RBX][79:64] -> XMM0[111:96]
//                                  // [RBX][79:64] -> XMM0[127:112]
//                                  // [RBX][63:0]  -> XMM0[63:0]
//
//  E4 N  RBX [R]  XMM0  PSHUFHW,   // [RBX][127:0] -> XMM0[127:0]
//
//  FF N  RBX [R]  XMM0  PSHUFHW,   // [RBX][127:112] -> XMM0[79:64]
//                                  // [RBX][127:112] -> XMM0[95:80]
//                                  // [RBX][127:112] -> XMM0[111:96]
//                                  // [RBX][127:112] -> XMM0[127:112]
//                                  // [RBX][63:0]    -> XMM0[63:0]
//
//  02 N  XMM2  XMM0  PSHUFHW,      // XMM2[111:96] -> XMM0[79:64]
//                                  // XMM2[79:64]  -> XMM0[95:80]
//                                  // XMM2[79:64]  -> XMM0[111:96]
//                                  // XMM2[79:64]  -> XMM0[127:112]
//                                  // XMM2[63:0]   -> XMM0[63:0]
//
//  02 N  XMM0 <- XMM2  PSHUFHW, // XMM2[111:96] -> XMM0[79:64]
//                                  // XMM2[79:64]  -> XMM0[95:80]
//                                  // XMM2[79:64]  -> XMM0[111:96]
//                                  // XMM2[79:64]  -> XMM0[127:112]
//                                  // XMM2[63:0]   -> XMM0[63:0]
//
//  02 N  XMM8  XMM0 PSHUFHW,       // XMM8[111:96] -> XMM0[79:64]
//                                  // XMM8[79:64]  -> XMM0[95:80]
//                                  // XMM8[79:64]  -> XMM0[111:96]
//                                  // XMM8[79:64]  -> XMM0[127:112]
//                                  // XMM8[63:0]   -> XMM0[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpshufhwcomma ( VPSHUFHW, )
//
// C prototype:
//  void dg_forthvpshufhwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for the immediate target can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSHUFHW instruction. This opcode sequence copies 16 bit values from
//   the high 64 bits of each 128 bit section of the source to the destination.
//   Each pair of bits in the 8 bit immediate value determines which source values 
//   to copy from each 128 bit section of the source to the destination.
//   immediatevalue[1:0] chooses source for dest[79:64] and dest[207:192]
//   immediatevalue[3:2] chooses source for dest[95:80] and dest[223:208]
//   immediatevalue[5:4] chooses source for dest[111:96] and dest[239:224]
//   immediatevalue[7:6] chooses source for dest[127:112] and dest[255:240]
//   The low 64 bits of each 128 bit section of the source are also copied to the 
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM0  VPSHUFHW,  // [RBX][79:64] -> XMM0[79:64]
//                                  // [RBX][79:64] -> XMM0[95:80]
//                                  // [RBX][79:64] -> XMM0[111:96]
//                                  // [RBX][79:64] -> XMM0[127:112]
//                                  // [RBX][63:0]  -> XMM0[63:0]
//
//  E4 N  RBX [R]  XMM0  VPSHUFHW,   // [RBX][127:0] -> XMM0[127:0]
//
//  FF N  RBX [R]  XMM0  VPSHUFHW,  // [RBX][127:112] -> XMM0[79:64]
//                                  // [RBX][127:112] -> XMM0[95:80]
//                                  // [RBX][127:112] -> XMM0[111:96]
//                                  // [RBX][127:112] -> XMM0[127:112]
//                                  // [RBX][63:0]    -> XMM0[63:0]
//
//  02 N  XMM2  XMM0  VPSHUFHW,     // XMM2[111:96] -> XMM0[79:64]
//                                  // XMM2[79:64]  -> XMM0[95:80]
//                                  // XMM2[79:64]  -> XMM0[111:96]
//                                  // XMM2[79:64]  -> XMM0[127:112]
//                                  // XMM2[63:0]   -> XMM0[63:0]
//
//  02 N  XMM0 <- XMM2  VPSHUFHW, // XMM2[111:96] -> XMM0[79:64]
//                                   // XMM2[79:64]  -> XMM0[95:80]
//                                   // XMM2[79:64]  -> XMM0[111:96]
//                                   // XMM2[79:64]  -> XMM0[127:112]
//                                   // XMM2[63:0]   -> XMM0[63:0]
//
//  02 N  YMM8  YMM0 VPSHUFHW,      // YMM8[111:96]  -> YMM0[79:64]
//                                  // YMM8[79:64]   -> YMM0[95:80]
//                                  // YMM8[79:64]   -> YMM0[111:96]
//                                  // YMM8[79:64]   -> YMM0[127:112]
//                                  // YMM8[63:0]    -> YMM0[63:0]
//                                  // YMM8[239:224] -> YMM0[207:192]
//                                  // YMM8[207:192] -> YMM0[223:208]
//                                  // YMM8[207:192] -> YMM0[239:224]
//                                  // YMM8[207:192] -> YMM0[255:240]
//                                  // YMM8[191:128] -> YMM0[191:128]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register. The first target must be the immediate target. If you use
//   -> it must come after a memory, xmm, or ymm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpshuflwcomma ( PSHUFLW, )
//
// C prototype:
//  void dg_forthpshuflwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( immediatetargetparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  immediatetargetparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for the immediate target can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for these targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 PSHUFLW instruction. This opcode sequence copies 16 bit values from
//   the low 64 bits of the source to the destination. Each pair of bits in the
//   8 bit immediate value determines which source value to copy into a
//   destination value.
//   immediatevalue[1:0] chooses source for dest[15:0]
//   immediatevalue[3:2] chooses source for dest[31:16]
//   immediatevalue[5:4] chooses source for dest[47:32]
//   immediatevalue[7:6] chooses source for dest[63:48]
//   The high 64 bits of the source are also copied to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM0  PSHUFLW,   // [RBX][15:0]   -> XMM0[15:0]
//                                  // [RBX][15:0]   -> XMM0[31:16]
//                                  // [RBX][15:0]   -> XMM0[47:32]
//                                  // [RBX][15:0]   -> XMM0[63:48]
//                                  // [RBX][127:64] -> XMM0[127:64]
//
//  E4 N  RBX [R]  XMM0  PSHUFLW,   // [RBX][127:0]  -> XMM0[127:0]
//
//  FF N  RBX [R]  XMM0  PSHUFLW,   // [RBX][63:48]  -> XMM0[15:0]
//                                  // [RBX][63:48]  -> XMM0[31:16]
//                                  // [RBX][63:48]  -> XMM0[47:32]
//                                  // [RBX][63:48]  -> XMM0[63:48]
//                                  // [RBX][127:64] -> XMM0[127:64]
//
//  02 N  XMM2  XMM0  PSHUFLW,      // XMM2[47:32]  -> XMM0[15:0]
//                                  // XMM2[15:0]   -> XMM0[31:16]
//                                  // XMM2[15:0]   -> XMM0[47:32]
//                                  // XMM2[15:0]   -> XMM0[63:48]
//                                  // XMM2[127:64] -> XMM0[127:64]
//
//  02 N  XMM0 <- XMM2  PSHUFLW, // XMM2[47:32]  -> XMM0[15:0]
//                                  // XMM2[15:0]   -> XMM0[31:16]
//                                  // XMM2[15:0]   -> XMM0[47:32]
//                                  // XMM2[15:0]   -> XMM0[63:48]
//                                  // XMM2[127:64] -> XMM0[127:64]
//
//  02 N  XMM8  XMM0 PSHUFLW,       // XMM8[47:32]  -> XMM0[15:0]
//                                  // XMM8[15:0]   -> XMM0[31:16]
//                                  // XMM8[15:0]   -> XMM0[47:32]
//                                  // XMM8[15:0]   -> XMM0[63:48]
//                                  // XMM8[127:64] -> XMM0[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpshuflwcomma ( VPSHUFLW, )
//
// C prototype:
//  void dg_forthvpshuflwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for the immediate target can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for these targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSHUFLW instruction. This opcode sequence copies 16 bit values from
//   the low 64 bits of each 128 bit section of the source to the destination.
//   Each pair of bits in the 8 bit immediate value determines which source values
//   to copy from each 128 bit section of the source to the destination.
//  
//   immediatevalue[1:0] chooses source for dest[15:0]  and dest[143:128]
//   immediatevalue[3:2] chooses source for dest[31:16] and dest[159:144]
//   immediatevalue[5:4] chooses source for dest[47:32] and dest[175:160]
//   immediatevalue[7:6] chooses source for dest[63:48] and dest[191:176]
//   The high 64 bits of each 128 bit section of the source are also copied to the 
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM0  VPSHUFLW,  // [RBX][15:0]   -> XMM0[15:0]
//                                  // [RBX][15:0]   -> XMM0[31:16]
//                                  // [RBX][15:0]   -> XMM0[47:32]
//                                  // [RBX][15:0]   -> XMM0[63:48]
//                                  // [RBX][127:64] -> XMM0[127:64]
//
//  E4 N  RBX [R]  XMM0  VPSHUFLW,   // [RBX][127:0]  -> XMM0[127:0]
//
//  FF N  RBX [R]  XMM0  VPSHUFLW,  // [RBX][63:48]  -> XMM0[15:0]
//                                  // [RBX][63:48]  -> XMM0[31:16]
//                                  // [RBX][63:48]  -> XMM0[47:32]
//                                  // [RBX][63:48]  -> XMM0[63:48]
//                                  // [RBX][127:64] -> XMM0[127:64]
//
//  02 N  XMM2  XMM0  VPSHUFLW,     // XMM2[47:32]  -> XMM0[15:0]
//                                  // XMM2[15:0]   -> XMM0[31:16]
//                                  // XMM2[15:0]   -> XMM0[47:32]
//                                  // XMM2[15:0]   -> XMM0[63:48]
//                                  // XMM2[127:64] -> XMM0[127:64]
//
//  02 N  XMM0 <- XMM2  VPSHUFLW, // XMM2[47:32]  -> XMM0[15:0]
//                                   // XMM2[15:0]   -> XMM0[31:16]
//                                   // XMM2[15:0]   -> XMM0[47:32]
//                                   // XMM2[15:0]   -> XMM0[63:48]
//                                   // XMM2[127:64] -> XMM0[127:64]
//
//  02 N  YMM8  YMM0 VPSHUFLW,      // YMM8[47:32]   -> YMM0[15:0]
//                                  // YMM8[15:0]    -> YMM0[31:16]
//                                  // YMM8[15:0]    -> YMM0[47:32]
//                                  // YMM8[15:0]    -> YMM0[63:48]
//                                  // YMM8[127:64]  -> YMM0[127:64]
//                                  // YMM8[175:160] -> YMM0[143:128]
//                                  // YMM8[143:128] -> YMM0[159:144]
//                                  // YMM8[143:128] -> YMM0[175:160]
//                                  // YMM8[143:128] -> YMM0[191:176]
//                                  // YMM8[255:192] -> YMM0[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register. The first target must be the immediate target. If you use
//   -> it must come after a memory, xmm, or ymm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpshufwcomma ( PSHUFW, )
//
// C prototype:
//  void dg_forthpshufwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( immediatetargetparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  immediatetargetparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for the immediate target can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for these targets x and y can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   FPSR                         specifies a floating point register target.
//                                 FPSR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 PSHUFW instruction. This opcode sequence copies 16 bit values from
//   the source to the destination. Each pair of bits in the
//   8 bit immediate value determines which source value to copy into a
//   destination value.
//   immediatevalue[1:0] chooses source for dest[15:0]
//   immediatevalue[3:2] chooses source for dest[31:16]
//   immediatevalue[5:4] chooses source for dest[47:32]
//   immediatevalue[7:6] chooses source for dest[63:48]
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  ST0  PSHUFW,     // [RBX][15:0]   -> ST0[15:0]
//                                  // [RBX][15:0]   -> ST0[31:16]
//                                  // [RBX][15:0]   -> ST0[47:32]
//                                  // [RBX][15:0]   -> ST0[63:48]
//                                  // [RBX][127:64] -> ST0[127:64]
//
//  E4 N  RBX [R]  ST0  PSHUFW,     // [RBX][63:0]   -> ST0[63:0]
//
//  FF N  RBX [R]  ST0  PSHUFW,     // [RBX][63:48]  -> ST0[15:0]
//                                  // [RBX][63:48]  -> ST0[31:16]
//                                  // [RBX][63:48]  -> ST0[47:32]
//                                  // [RBX][63:48]  -> ST0[63:48]
//                                  // [RBX][127:64] -> ST0[127:64]
//
//  02 N  ST2  ST0  PSHUFW,         // ST2[47:32]  -> ST0[15:0]
//                                  // ST2[15:0]   -> ST0[31:16]
//                                  // ST2[15:0]   -> ST0[47:32]
//                                  // ST2[15:0]   -> ST0[63:48]
//                                  // ST2[127:64] -> ST0[127:64]
//
//  02 N  ST0 <- ST2  PSHUFW,    // ST2[47:32]  -> ST0[15:0]
//                                  // ST2[15:0]   -> ST0[31:16]
//                                  // ST2[15:0]   -> ST0[47:32]
//                                  // ST2[15:0]   -> ST0[63:48]
//                                  // ST2[127:64] -> ST0[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be a
//   floating point (mmx) register. The first target must be the immediate
//   target. If you use -> it must come after a memory or floating point
//   register target. The immediate target's size is one byte, so if you use
//   IMMEDIATE to specify a minimum size, it must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthroundpdcomma ( ROUNDPD, )
//
// C prototype:
//  void dg_forthroundpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( immediatetargetparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  immediatetargetparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for the immediate target can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 ROUNDPD instruction. This opcode sequence rounds each of the two
//   double precision floating point values up or down to the nearest double
//   precision floating point value representing an integer according to the
//   rounding mode selected in the immediate value and puts the results into
//   the destination.
//
//   immediatevalue[1:0]   meaning
//    0                     round to closest integer, in the event of a tie,
//                           round to the closest even integer
//    1                     round towards negative infinity to closest integer
//    2                     round towards positive infinity to closest integer
//    3                     round towards 0 to closest integer
//
//   immediatevalue[2]     meaning
//    0                     use immediatevalue[1:0] to determine the rounding mode
//    1                     use MXCSR.RC to determine the rounding mode
//
//   immediatevalue[3]     meaning
//    0                     normal precision mask
//    1                     inexact precision mask
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM0  ROUNDPD,   // round([RBX][63:0])   -> XMM0[63:0]
//                                  // round([RBX][127:64]) -> XMM0[127:64]
//
//  01 N  RBX [R]  XMM0  ROUNDPD,   // rounddown([RBX][63:0])   -> XMM0[63:0]
//                                  // rounddown([RBX][127:64]) -> XMM0[127:64]
//
//  02 N  RBX [R]  XMM0  ROUNDPD,   // roundup([RBX][63:0])   -> XMM0[63:0]
//                                  // roundup([RBX][127:64]) -> XMM0[127:64]
//
//  03 N  XMM2  XMM0  ROUNDPD,      // truncate(XMM2[63:0])   -> XMM0[63:0]
//                                  // truncate(XMM2[127:64]) -> XMM0[127:64]
//
//  03 N  XMM0 <- XMM2  ROUNDPD, // truncate(XMM2[63:0])   -> XMM0[63:0]
//                                  // truncate(XMM2[127:64]) -> XMM0[127:64]
//
//  03 N  XMM8  XMM0 ROUNDPD,       // truncate(XMM8[63:0])   -> XMM0[63:0]
//                                  // truncate(XMM8[127:64]) -> XMM0[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvroundpdcomma ( VROUNDPD, )
//
// C prototype:
//  void dg_forthvroundpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VROUNDPD instruction. This opcode sequence rounds each double 
//   precision floating point value from the source up or down to the nearest 
//   double precision floating point value representing an integer according to 
//   the rounding mode selected in the immediate value and puts the results into
//   the destination.
//
//   immediatevalue[1:0]   meaning
//    0                     round to closest integer, in the event of a tie,
//                           round to the closest even integer
//    1                     round towards negative infinity to closest integer
//    2                     round towards positive infinity to closest integer
//    3                     round towards 0 to closest integer
//
//   immediatevalue[2]     meaning
//    0                     use immediatevalue[1:0] to determine the rounding mode
//    1                     use MXCSR.RC to determine the rounding mode
//
//   immediatevalue[3]     meaning
//    0                     normal precision mask
//    1                     inexact precision mask
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM0  VROUNDPD,  // round([RBX][63:0])   -> XMM0[63:0]
//                                  // round([RBX][127:64]) -> XMM0[127:64]
//
//  01 N  RBX [R]  XMM0  VROUNDPD,  // rounddown([RBX][63:0])   -> XMM0[63:0]
//                                  // rounddown([RBX][127:64]) -> XMM0[127:64]
//
//  02 N  RBX [R]  XMM0  VROUNDPD,  // roundup([RBX][63:0])   -> XMM0[63:0]
//                                  // roundup([RBX][127:64]) -> XMM0[127:64]
//
//  03 N  XMM2  XMM0  VROUNDPD,     // truncate(XMM2[63:0])   -> XMM0[63:0]
//                                  // truncate(XMM2[127:64]) -> XMM0[127:64]
//
//  03 N  XMM0 <- XMM2  VROUNDPD,// truncate(XMM2[63:0])   -> XMM0[63:0]
//                                  // truncate(XMM2[127:64]) -> XMM0[127:64]
//
//  03 N  YMM8  YMM0 VROUNDPD,      // truncate(YMM8[63:0])    -> YMM0[63:0]
//                                  // truncate(YMM8[127:64])  -> YMM0[127:64]
//                                  // truncate(YMM8[191:128]) -> YMM0[191:128]
//                                  // truncate(YMM8[255:192]) -> YMM0[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register. The first target must be the immediate target. If you use
//   -> it must come after a memory, xmm or ymm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthroundpscomma ( ROUNDPS, )
//
// C prototype:
//  void dg_forthroundpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 ROUNDPS instruction. This opcode sequence rounds each of the four
//   single precision floating point values up or down to the nearest single
//   precision floating point value representing an integer according to the
//   rounding mode selected in the immediate value and puts the results into
//   the destination.
//
//   immediatevalue[1:0]   meaning
//    0                     round to closest integer, in the event of a tie,
//                           round to the closest even integer
//    1                     round towards negative infinity to closest integer
//    2                     round towards positive infinity to closest integer
//    3                     round towards 0 to closest integer
//
//   immediatevalue[2]     meaning
//    0                     use immediatevalue[1:0] to determine the rounding mode
//    1                     use MXCSR.RC to determine the rounding mode
//
//   immediatevalue[3]     meaning
//    0                     normal precision mask
//    1                     inexact precision mask
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM0  ROUNDPS,   // round([RBX][31:0])   -> XMM0[31:0]
//                                  // round([RBX][63:32])  -> XMM0[63:32]
//                                  // round([RBX][95:64])  -> XMM0[95:64]
//                                  // round([RBX][127:96]) -> XMM0[127:96]
//
//  01 N  RBX [R]  XMM0  ROUNDPS,   // rounddown([RBX][31:0])   -> XMM0[31:0]
//                                  // rounddown([RBX][63:32])  -> XMM0[63:32]
//                                  // rounddown([RBX][95:64])  -> XMM0[95:64]
//                                  // rounddown([RBX][127:96]) -> XMM0[127:96]
//
//  02 N  RBX [R]  XMM0  ROUNDPS,   // roundup([RBX][31:0])   -> XMM0[31:0]
//                                  // roundup([RBX][63:32])  -> XMM0[63:32]
//                                  // roundup([RBX][95:64])  -> XMM0[95:64]
//                                  // roundup([RBX][127:96]) -> XMM0[127:96]
//
//  03 N  XMM2  XMM0  ROUNDPS,      // truncate(XMM2[31:0])   -> XMM0[31:0]
//                                  // truncate(XMM2[63:32])  -> XMM0[63:32]
//                                  // truncate(XMM2[95:64])  -> XMM0[95:64]
//                                  // truncate(XMM2[127:96]) -> XMM0[127:96]
//
//  03 N  XMM0 <- XMM2  ROUNDPS, // truncate(XMM2[31:0])   -> XMM0[31:0]
//                                  // truncate(XMM2[63:32])  -> XMM0[63:32]
//                                  // truncate(XMM2[95:64])  -> XMM0[95:64]
//                                  // truncate(XMM2[127:96]) -> XMM0[127:96]
//
//  03 N  XMM8  XMM0 ROUNDPS,       // truncate(XMM8[31:0])   -> XMM0[31:0]
//                                  // truncate(XMM8[63:32])  -> XMM0[63:32]
//                                  // truncate(XMM8[95:64])  -> XMM0[95:64]
//                                  // truncate(XMM8[127:96]) -> XMM0[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvroundpscomma ( VROUNDPS, )
//
// C prototype:
//  void dg_forthvroundpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VROUNDPS instruction. This opcode sequence rounds each single 
//   precision floating point value from the source up or down to the nearest 
//   single precision floating point value representing an integer according to 
//   the rounding mode selected in the immediate value and puts the results into
//   the destination.
//
//   immediatevalue[1:0]   meaning
//    0                     round to closest integer, in the event of a tie,
//                           round to the closest even integer
//    1                     round towards negative infinity to closest integer
//    2                     round towards positive infinity to closest integer
//    3                     round towards 0 to closest integer
//
//   immediatevalue[2]     meaning
//    0                     use immediatevalue[1:0] to determine the rounding mode
//    1                     use MXCSR.RC to determine the rounding mode
//
//   immediatevalue[3]     meaning
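//    0                     normal precision mask (an inexact result is reported
//                           as a precision exception)
//    1                     inexact precision mask (the precision exception for
//                           an inexact result is suppressed)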
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM0  VROUNDPS,  // round([RBX][31:0])   -> XMM0[31:0]
//                                  // round([RBX][63:32])  -> XMM0[63:32]
//                                  // round([RBX][95:64])  -> XMM0[95:64]
//                                  // round([RBX][127:96]) -> XMM0[127:96]
//
//  01 N  RBX [R]  XMM0  VROUNDPS,  // rounddown([RBX][31:0])   -> XMM0[31:0]
//                                  // rounddown([RBX][63:32])  -> XMM0[63:32]
//                                  // rounddown([RBX][95:64])  -> XMM0[95:64]
//                                  // rounddown([RBX][127:96]) -> XMM0[127:96]
//
//  02 N  RBX [R]  XMM0  VROUNDPS,  // roundup([RBX][31:0])   -> XMM0[31:0]
//                                  // roundup([RBX][63:32])  -> XMM0[63:32]
//                                  // roundup([RBX][95:64])  -> XMM0[95:64]
//                                  // roundup([RBX][127:96]) -> XMM0[127:96]
//
//  03 N  XMM2  XMM0  VROUNDPS,     // truncate(XMM2[31:0])   -> XMM0[31:0]
//                                  // truncate(XMM2[63:32])  -> XMM0[63:32]
//                                  // truncate(XMM2[95:64])  -> XMM0[95:64]
//                                  // truncate(XMM2[127:96]) -> XMM0[127:96]
//
//  03 N  XMM0 <- XMM2  VROUNDPS, // truncate(XMM2[31:0])   -> XMM0[31:0]
//                                   // truncate(XMM2[63:32])  -> XMM0[63:32]
//                                   // truncate(XMM2[95:64])  -> XMM0[95:64]
//                                   // truncate(XMM2[127:96]) -> XMM0[127:96]
//
//  03 N  YMM8  YMM0 VROUNDPS,      // truncate(YMM8[31:0])    -> YMM0[31:0]
//                                  // truncate(YMM8[63:32])   -> YMM0[63:32]
//                                  // truncate(YMM8[95:64])   -> YMM0[95:64]
//                                  // truncate(YMM8[127:96])  -> YMM0[127:96]
//                                  // truncate(YMM8[159:128]) -> YMM0[159:128]
//                                  // truncate(YMM8[191:160]) -> YMM0[191:160]
//                                  // truncate(YMM8[223:192]) -> YMM0[223:192]
//                                  // truncate(YMM8[255:224]) -> YMM0[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register. The first target must be the immediate target. If you use
//   -> it must come after a memory, xmm, or ymm target. The immediate 
//   target's size is one byte, so if you use IMMEDIATE to specify a minimum size, 
//   it must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthroundsdcomma ( ROUNDSD, )
//
// C prototype:
//  void dg_forthroundsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 ROUNDSD instruction. This opcode sequence rounds the double
//   precision floating point value in the low 64 bits of the source up or down
//   to the nearest double precision floating point value representing an integer
//   according to the rounding mode selected in the immediate value and puts the
//   result into the destination. The upper 64 bits of the destination are
//   unchanged.
//
//   immediatevalue[1:0]   meaning
//    0                     round to closest integer, in the event of a tie,
//                           round to the closest even integer
//    1                     round towards negative infinity to closest integer
//    2                     round towards positive infinity to closest integer
//    3                     round towards 0 to closest integer
//
//   immediatevalue[2]     meaning
//    0                     use immediatevalue[1:0] to determine the rounding mode
//    1                     use MXCSR.RC to determine the rounding mode
//
//   immediatevalue[3]     meaning
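//    0                     normal precision mask (an inexact result is reported
//                           as a precision exception)
//    1                     inexact precision mask (the precision exception for
//                           an inexact result is suppressed)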
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM0  ROUNDSD,   // round([RBX][63:0])   -> XMM0[63:0]
//
//  01 N  RBX [R]  XMM0  ROUNDSD,   // rounddown([RBX][63:0])   -> XMM0[63:0]
//
//  02 N  RBX [R]  XMM0  ROUNDSD,   // roundup([RBX][63:0])   -> XMM0[63:0]
//
//  03 N  XMM2  XMM0  ROUNDSD,      // truncate(XMM2[63:0])   -> XMM0[63:0]
//
//  03 N  XMM0 <- XMM2  ROUNDSD, // truncate(XMM2[63:0])   -> XMM0[63:0]
//
//  03 N  XMM8  XMM0 ROUNDSD,       // truncate(XMM8[63:0])   -> XMM0[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvroundsdcomma ( VROUNDSD, )
//
// C prototype:
//  void dg_forthvroundsdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist 
//     targetzparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for target y can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targets x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls four targets from the data stack and compiles the opcode sequence for
//   an x86 VROUNDSD instruction. This opcode sequence rounds the double
//   precision floating point value in the low 64 bits of the source up or down
//   to the nearest double precision floating point value representing an integer
//   according to the rounding mode selected in the immediate value and puts the
//   result into the destination. The upper 64 bits of the destination are copied
//   from the upper 64 bits of target y.
//
//   immediatevalue[1:0]   meaning
//    0                     round to closest integer, in the event of a tie,
//                           round to the closest even integer
//    1                     round towards negative infinity to closest integer
//    2                     round towards positive infinity to closest integer
//    3                     round towards 0 to closest integer
//
//   immediatevalue[2]     meaning
//    0                     use immediatevalue[1:0] to determine the rounding mode
//    1                     use MXCSR.RC to determine the rounding mode
//
//   immediatevalue[3]     meaning
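//    0                     normal precision mask (an inexact result is reported
//                           as a precision exception)
//    1                     inexact precision mask (the precision exception for
//                           an inexact result is suppressed)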
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM1  XMM0  VROUNDSD,   
//                             // round([RBX][63:0])       -> XMM0[63:0]
//                             // XMM1[127:64]             -> XMM0[127:64]
//                             // 0                        -> YMM0[255:128]
//
//  01 N  RBX [R]  XMM1  XMM0  VROUNDSD,   
//                             // rounddown([RBX][63:0])   -> XMM0[63:0]
//                             // XMM1[127:64]             -> XMM0[127:64]
//                             // 0                        -> YMM0[255:128]
//
//  02 N  RBX [R]  XMM1  XMM0  VROUNDSD,   
//                             // roundup([RBX][63:0])     -> XMM0[63:0]
//                             // XMM1[127:64]             -> XMM0[127:64]
//                             // 0                        -> YMM0[255:128]
//
//  03 N  XMM2  XMM1  XMM0  VROUNDSD,      
//                             // truncate(XMM2[63:0])     -> XMM0[63:0]
//                             // XMM1[127:64]             -> XMM0[127:64]
//                             // 0                        -> YMM0[255:128]
//
//  03 N  XMM0 <- XMM1  XMM2  VROUNDSD, 
//                             // truncate(XMM2[63:0])     -> XMM0[63:0]
//                             // XMM1[127:64]             -> XMM0[127:64]
//                             // 0                        -> YMM0[255:128]
//
//  03 N  XMM8  XMM1  XMM0 VROUNDSD,       
//                             // truncate(XMM8[63:0])     -> XMM0[63:0]
//                             // XMM1[127:64]             -> XMM0[127:64]
//                             // 0                        -> YMM0[255:128]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y must
//   be xmm registers. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthroundsscomma ( ROUNDSS, )
//
// C prototype:
//  void dg_forthroundsscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 ROUNDSS instruction. This opcode sequence rounds the single precision
//   floating point value in the lower 32 bits of the source up or down to the
//   nearest single precision floating point value representing an integer
//   according to the rounding mode selected in the immediate value and puts the
//   result into the destination.
//
//   immediatevalue[1:0]   meaning
//    0                     round to closest integer, in the event of a tie,
//                           round to the closest even integer
//    1                     round towards negative infinity to closest integer
//    2                     round towards positive infinity to closest integer
//    3                     round towards 0 to closest integer
//
//   immediatevalue[2]     meaning
//    0                     use immediatevalue[1:0] to determine the rounding mode
//    1                     use MXCSR.RC to determine the rounding mode
//
//   immediatevalue[3]     meaning
//    0                     normal precision mask
//    1                     inexact precision mask
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM0  ROUNDSS,   // round([RBX][31:0])   -> XMM0[31:0]
//
//  01 N  RBX [R]  XMM0  ROUNDSS,   // rounddown([RBX][31:0])   -> XMM0[31:0]
//
//  02 N  RBX [R]  XMM0  ROUNDSS,   // roundup([RBX][31:0])   -> XMM0[31:0]
//
//  03 N  XMM2  XMM0  ROUNDSS,      // truncate(XMM2[31:0])   -> XMM0[31:0]
//
//  03 N  XMM0 <- XMM2  ROUNDSS, // truncate(XMM2[31:0])   -> XMM0[31:0]
//
//  03 N  XMM8  XMM0 ROUNDSS,       // truncate(XMM8[31:0])   -> XMM0[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvroundsscomma ( VROUNDSS, )
//
// C prototype:
//  void dg_forthvroundsscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist 
//     targetzparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for target y can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for these targets x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls four targets from the data stack and compiles the opcode sequence for
//   an x86 VROUNDSS instruction. This opcode sequence rounds the single precision
//   floating point value in the lower 32 bits of the source up or down to the
//   nearest single precision floating point value representing an integer
//   according to the rounding mode selected in the immediate value and puts the
//   result into the destination. This opcode sequence also copies the upper 96
//   bits of target y to the upper 96 bits of the destination.
//
//   immediatevalue[1:0]   meaning
//    0                     round to closest integer, in the event of a tie,
//                           round to the closest even integer
//    1                     round towards negative infinity to closest integer
//    2                     round towards positive infinity to closest integer
//    3                     round towards 0 to closest integer
//
//   immediatevalue[2]     meaning
//    0                     use immediatevalue[1:0] to determine the rounding mode
//    1                     use MXCSR.RC to determine the rounding mode
//
//   immediatevalue[3]     meaning
//    0                     normal precision mask
//    1                     inexact precision mask
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM1  XMM0  VROUNDSS,   
//                         // round([RBX][31:0])     -> XMM0[31:0]
//                         // XMM1[127:32]           -> XMM0[127:32]
//                         // 0                      -> YMM0[255:128]
//
//  01 N  RBX [R]  XMM1  XMM0  VROUNDSS,   
//                         // rounddown([RBX][31:0]) -> XMM0[31:0]
//                         // XMM1[127:32]           -> XMM0[127:32]
//                         // 0                      -> YMM0[255:128]
//
//  02 N  RBX [R]  XMM1  XMM0  VROUNDSS,   
//                         // roundup([RBX][31:0])   -> XMM0[31:0]
//                         // XMM1[127:32]           -> XMM0[127:32]
//                         // 0                      -> YMM0[255:128]
//
//  03 N  XMM2  XMM1  XMM0  VROUNDSS,      
//                         // truncate(XMM2[31:0])   -> XMM0[31:0]
//                         // XMM1[127:32]           -> XMM0[127:32]
//                         // 0                      -> YMM0[255:128]
//
//  03 N  XMM0 <- XMM1  XMM2  VROUNDSS, 
//                         // truncate(XMM2[31:0])   -> XMM0[31:0]
//                         // XMM1[127:32]           -> XMM0[127:32]
//                         // 0                      -> YMM0[255:128]
//
//  03 N  XMM8  XMM1  XMM0 VROUNDSS,       
//                         // truncate(XMM8[31:0])   -> XMM0[31:0]
//                         // XMM1[127:32]           -> XMM0[127:32]
//                         // 0                      -> YMM0[255:128]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y
//   must be an xmm register. The first target must be the immediate target. 
//   If you use -> it must come after a memory or xmm target. The immediate 
//   target's size is one byte, so if you use IMMEDIATE to specify a minimum
//    size, it must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthshufpdcomma ( SHUFPD, )
//
// C prototype:
//  void dg_forthshufpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for these targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 SHUFPD instruction. This opcode sequence copies 64 bit values from
//   the source to the destination. The low two bits in the 8 bit immediate
//   value determine which source value to copy into a destination value.
//   immediatevalue[0] chooses source for dest[63:0] from either dest[127:64] or
//   dest[63:0]
//   immediatevalue[1] chooses source for dest[127:64] from either 
//   source[127:64] or source[63:0]
//  Intel docs say it moves double floating point values, but it does not
//   matter what kind of values they are.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM0  SHUFPD,   // XMM0[63:0]  -> XMM0[63:0]
//                                 // [RBX][63:0] -> XMM0[127:64]
//
//  02 N  RBX [R]  XMM0  SHUFPD,   // XMM0[63:0]    -> XMM0[63:0]
//                                 // [RBX][127:64] -> XMM0[127:64]
//
//  03 N  RBX [R]  XMM0  SHUFPD,   // XMM0[127:64]  -> XMM0[63:0]
//                                 // [RBX][127:64] -> XMM0[127:64]
//
//  03 N  XMM2  XMM0  SHUFPD,      // XMM0[127:64] -> XMM0[63:0]
//                                 // XMM2[127:64] -> XMM0[127:64]
//
//  03 N  XMM0 <- XMM2  SHUFPD, // XMM0[127:64] -> XMM0[63:0]
//                                 // XMM2[127:64] -> XMM0[127:64]
//
//  03 N  XMM8  XMM0 SHUFPD,       // XMM0[127:64] -> XMM0[63:0]
//                                 // XMM8[127:64] -> XMM0[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvshufpdcomma ( VSHUFPD, )
//
// C prototype:
//  void dg_forthvshufpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist 
//     targetzparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for the target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for target y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targets x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls four targets from the data stack and compiles the opcode sequence for
//   an x86 VSHUFPD instruction. This opcode sequence copies 64 bit values from
//   the source and target y to the destination. The low four bits in the 8 bit
//   immediate value determine which 64 bit value to copy from each 128 bit
//   section of the source and target y into each destination value. With xmm
//   targets only the low two bits are used.
//   immediatevalue[0] chooses the source for dest[63:0]
//   immediatevalue[1] chooses the source for dest[127:64]
//   immediatevalue[2] chooses the source for dest[191:128]
//   immediatevalue[3] chooses the source for dest[255:192]
//  Intel docs say it moves double floating point values, but it does not
//   matter what kind of values they are.
//
//   immediatevalue[0]   source for dest[63:0]
//    0                   targety[63:0]
//    1                   targety[127:64]
//
//   immediatevalue[1]   source for dest[127:64]
//    0                   source[63:0]
//    1                   source[127:64]
//
//   immediatevalue[2]   source for dest[191:128]
//    0                   targety[191:128]
//    1                   targety[255:192]
//
//   immediatevalue[3]   source for dest[255:192]
//    0                   source[191:128]
//    1                   source[255:192]
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM1  XMM0  VSHUFPD,   
//                          // XMM1[63:0]  -> XMM0[63:0]
//                          // [RBX][63:0] -> XMM0[127:64]
//
//  02 N  RBX [R]  XMM1  XMM0  VSHUFPD, 
//                          // XMM1[63:0]   -> XMM0[63:0]  
//                          // [RBX][127:64] -> XMM0[127:64]
//
//  03 N  RBX [R]  XMM1  XMM0  VSHUFPD,   
//                          // XMM1[127:64]  -> XMM0[63:0]
//                          // [RBX][127:64] -> XMM0[127:64]
//
//  03 N  XMM2  XMM1  XMM0  VSHUFPD,      
//                          // XMM1[127:64] -> XMM0[63:0]
//                          // XMM2[127:64] -> XMM0[127:64]
//
//  03 N  XMM0 <- XMM1  XMM2  VSHUFPD, 
//                         // XMM1[127:64] -> XMM0[63:0]
//                         // XMM2[127:64] -> XMM0[127:64]
//
//  03 N  XMM8  XMM1  XMM0 VSHUFPD,       
//                         // XMM1[127:64] -> XMM0[63:0]
//                         // XMM8[127:64] -> XMM0[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register. The first target must be the immediate target. If you use
//   -> it must come after a memory, xmm, or ymm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthshufpscomma ( SHUFPS, )
//
// C prototype:
//  void dg_forthshufpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for these targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after a target's addressing
//                                 mode parameters and cannot come in the
//                                 middle of them.
//   <-                           sets the direction to reverse.
//                                 This is pushed after a target's addressing
//                                 mode parameters and cannot come in the
//                                 middle of them.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 SHUFPS instruction. This opcode sequence copies 32 bit values from
//   the source to the destination. Each pair of bits in the 8 bit immediate
//   value determines which source or destination value to copy into a 
//   destination value.
//   immediatevalue[1:0] chooses source for dest[31:0] from each 32 bit section
//    of the dest
//   immediatevalue[3:2] chooses source for dest[63:32] from each 32 bit section
//    of the dest
//   immediatevalue[5:4] chooses source for dest[95:64] from each 32 bit section
//    of the source
//   immediatevalue[7:6] chooses source for dest[127:96] from each 32 bit section
//    of the source
//  The Intel docs say it moves single precision floating point values, but it
//   does not matter what kind of values they are.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM0  SHUFPS,   // XMM0[31:0]  -> XMM0[31:0]
//                                 // XMM0[31:0]  -> XMM0[63:32]
//                                 // [RBX][31:0] -> XMM0[95:64]
//                                 // [RBX][31:0] -> XMM0[127:96]
//
//  E4 N  RBX [R]  XMM0  SHUFPS,   // XMM0[31:0]   -> XMM0[31:0]
//                                 // XMM0[63:32]  -> XMM0[63:32]
//                                 // [RBX][127:64] -> XMM0[127:64]
//
//  FF N  RBX [R]  XMM0  SHUFPS,   // XMM0[127:96]  -> XMM0[31:0]
//                                 // XMM0[127:96]  -> XMM0[63:32]
//                                 // [RBX][127:96] -> XMM0[95:64]
//                                 // [RBX][127:96] -> XMM0[127:96]
//
//  02 N  XMM2  XMM0  SHUFPS,      // XMM0[95:64] -> XMM0[31:0]
//                                 // XMM0[31:0]  -> XMM0[63:32]
//                                 // XMM2[31:0]  -> XMM0[95:64]
//                                 // XMM2[31:0]  -> XMM0[127:96]
//
//  02 N  XMM0 <- XMM2  SHUFPS,    // XMM0[95:64] -> XMM0[31:0]
//                                 // XMM0[31:0]  -> XMM0[63:32]
//                                 // XMM2[31:0]  -> XMM0[95:64]
//                                 // XMM2[31:0]  -> XMM0[127:96]
//
//  02 N  XMM8  XMM0 SHUFPS,       // XMM0[95:64] -> XMM0[31:0]
//                                 // XMM0[31:0]  -> XMM0[63:32]
//                                 // XMM8[31:0]  -> XMM0[95:64]
//                                 // XMM8[31:0]  -> XMM0[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  The failure cases have not been checked thoroughly, so
//   you may see some strange behavior if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvshufpscomma ( VSHUFPS, )
//
// C prototype:
//  void dg_forthvshufpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist 
//     targetzparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for the target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for target y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targets x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after a target's addressing
//                                 mode parameters and cannot come in the
//                                 middle of them.
//   <-                           sets the direction to reverse.
//                                 This is pushed after a target's addressing
//                                 mode parameters and cannot come in the
//                                 middle of them.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls four targets from the data stack and compiles the opcode sequence for
//   an x86 VSHUFPS instruction. This opcode sequence copies 32 bit values from
//   the source or target y to the destination. Each pair of bits in the 8 bit 
//   immediate value determines which source or target y value to copy into a  
//   destination value for each 128 bit section of the source, target y, and
//   the destination.
//   immediatevalue[1:0] chooses source for dest[31:0] and dest[159:128]
//    from target y
//   immediatevalue[3:2] chooses source for dest[63:32] and dest[191:160]
//    from target y
//   immediatevalue[5:4] chooses source for dest[95:64] and dest[223:192] 
//    from the source
//   immediatevalue[7:6] chooses source for dest[127:96] and dest[255:224]
//    from the source
//  The Intel docs say it moves single precision floating point values, but it
//   does not matter what kind of values they are.
//
//   immediatevalue[1:0]    source for dest[31:0]    source for dest[159:128]
//    0                      targety[31:0]            targety[159:128]
//    1                      targety[63:32]           targety[191:160]
//    2                      targety[95:64]           targety[223:192]
//    3                      targety[127:96]          targety[255:224]
//
//   immediatevalue[3:2]    source for dest[63:32]   source for dest[191:160]
//    0                      targety[31:0]            targety[159:128]
//    1                      targety[63:32]           targety[191:160]
//    2                      targety[95:64]           targety[223:192]
//    3                      targety[127:96]          targety[255:224]
//
//   immediatevalue[5:4]    source for dest[95:64]   source for dest[223:192]
//    0                      source[31:0]             source[159:128]
//    1                      source[63:32]            source[191:160]
//    2                      source[95:64]            source[223:192]
//    3                      source[127:96]           source[255:224]
//
//   immediatevalue[7:6]    source for dest[127:96]   source for dest[255:224]
//    0                      source[31:0]             source[159:128]
//    1                      source[63:32]            source[191:160]
//    2                      source[95:64]            source[223:192]
//    3                      source[127:96]           source[255:224]
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM1  XMM0  VSHUFPS,   
//                                 // XMM1[31:0]  -> XMM0[31:0]
//                                 // XMM1[31:0]  -> XMM0[63:32]
//                                 // [RBX][31:0] -> XMM0[95:64]
//                                 // [RBX][31:0] -> XMM0[127:96]
//
//  E4 N  RBX [R]  XMM1  XMM0  VSHUFPS,   
//                                 // XMM1[31:0]   -> XMM0[31:0]
//                                 // XMM1[63:32]  -> XMM0[63:32]
//                                 // [RBX][127:64] -> XMM0[127:64]
//
//  FF N  RBX [R]  XMM1  XMM0  VSHUFPS,   
//                                 // XMM1[127:96]  -> XMM0[31:0]
//                                 // XMM1[127:96]  -> XMM0[63:32]
//                                 // [RBX][127:96] -> XMM0[95:64]
//                                 // [RBX][127:96] -> XMM0[127:96]
//
//  02 N  XMM2  XMM1  XMM0  VSHUFPS,      
//                                 // XMM1[95:64] -> XMM0[31:0]
//                                 // XMM1[31:0]  -> XMM0[63:32]
//                                 // XMM2[31:0]  -> XMM0[95:64]
//                                 // XMM2[31:0]  -> XMM0[127:96]
//
//  02 N  XMM0 <- XMM1  XMM2  VSHUFPS, 
//                                 // XMM1[95:64] -> XMM0[31:0]
//                                 // XMM1[31:0]  -> XMM0[63:32]
//                                 // XMM2[31:0]  -> XMM0[95:64]
//                                 // XMM2[31:0]  -> XMM0[127:96]
//
//  02 N  YMM8  YMM1  YMM0 VSHUFPS,
//                                 // YMM1[95:64]   -> YMM0[31:0]
//                                 // YMM1[31:0]    -> YMM0[63:32]
//                                 // YMM8[31:0]    -> YMM0[95:64]
//                                 // YMM8[31:0]    -> YMM0[127:96]
//                                 // YMM1[223:192] -> YMM0[159:128]
//                                 // YMM1[159:128] -> YMM0[191:160]
//                                 // YMM8[159:128] -> YMM0[223:192]
//                                 // YMM8[159:128] -> YMM0[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register. The first target must be the immediate target. If you use
//   -> it must come after a memory, xmm, or ymm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  The failure cases have not been checked thoroughly, so
//   you may see some strange behavior if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpalignrcomma ( PALIGNR, )
//
// C prototype:
//  void dg_forthpalignrcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter lists for targets x and y can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after a target's addressing
//                                 mode parameters and cannot come in the
//                                 middle of them.
//   <-                           sets the direction to reverse.
//                                 This is pushed after a target's addressing
//                                 mode parameters and cannot come in the
//                                 middle of them.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 PALIGNR instruction. This opcode sequence concatenates the
//   destination and source with the destination as the high part, and the
//   source as the low part, then logical shifts the value to the right by the
//   number of bytes specified in the immediate value. A logical shift means
//   zeroes are shifted in from the left. The result is put into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM0  PALIGNR,  // [RBX][127:0] -> XMM0[127:0]
//
//  00 N  RBX [R]  ST0  PALIGNR,   // [RBX][63:0]  -> ST0[63:0]
//
//  01 N  RBX [R]  XMM0  PALIGNR,  // XMM0[7:0]    -> XMM0[127:120]
//                                 // [RBX][127:8] -> XMM0[119:0]
//
//  01 N  RBX [R]  ST0  PALIGNR,   // ST0[7:0]     -> ST0[63:56]
//                                 // [RBX][63:8]  -> ST0[55:0]
//
//  10 N  RBX [R]  XMM0  PALIGNR,  // XMM0 is unchanged
//
//  08 N  RBX [R]  ST0  PALIGNR,   // ST0 is unchanged
//
//  20 N  RBX [R]  XMM0  PALIGNR,  //            0 -> XMM0[127:0]
//
//  10 N  RBX [R]  ST0  PALIGNR,   //            0 -> ST0[63:0]
//
//  02 N  XMM2  XMM0  PALIGNR,     // XMM0[15:0]   -> XMM0[127:112]
//                                 // XMM2[127:16] -> XMM0[111:0]
//
//  02 N  XMM0 <- XMM2  PALIGNR,   // XMM0[15:0]   -> XMM0[127:112]
//                                 // XMM2[127:16] -> XMM0[111:0]
//
//  02 N  XMM8  XMM0 PALIGNR,      // XMM0[15:0]   -> XMM0[127:112]
//                                 // XMM8[127:16] -> XMM0[111:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register. The first target must be the immediate target. If you use
//   -> it must come after a memory or xmm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0.
//
// Failure cases:
//  The failure cases have not been checked thoroughly, so
//   you may see some strange behavior if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpalignrcomma ( VPALIGNR, )
//
// C prototype:
//  void dg_forthvpalignrcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
// 
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist 
//     targetzparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for the target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for target y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targets x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls four targets from the data stack and compiles the opcode sequence for
//   an x86 VPALIGNR instruction. This opcode sequence concatenates each 16 byte
//   section of target y and each 16 byte section of the source with the 
//   target y sections as the high part of the 32 byte intermediate result, 
//   and the source as the low part of the 32 byte intermediate result, 
//   then logical shifts the 32 byte intermediate value to the right by the number 
//   of bytes specified in the immediate value. A logical shift means zeroes are 
//   shifted in from the left. The lower 16 bytes of each intermediate result
//   are put into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM1  XMM0  VPALIGNR,  // nothing happens
//
//  01 N  RBX [R]  XMM1  XMM0  VPALIGNR,  
//                                 // [RBX][7:0]  -> XMM0[127:120]
//                                 // XMM1[127:8] -> XMM0[119:0]
//
//  10 N  RBX [R]  XMM1  XMM0  VPALIGNR,  // [RBX][127:0] -> XMM0[127:0]
//
//  20 N  RBX [R]  XMM1  XMM0  VPALIGNR,  //            0 -> XMM0[127:0]
//
//  02 N  XMM2  XMM1  XMM0  VPALIGNR,     // XMM2[15:0]   -> XMM0[127:112]
//                                        // XMM1[127:16] -> XMM0[111:0]
//
//  02 N  XMM0 <- XMM1  XMM2  VPALIGNR, // XMM2[15:0]   -> XMM0[127:112]
//                                         // XMM1[127:16] -> XMM0[111:0]
//
//  02 N  YMM8  YMM1  YMM0 VPALIGNR,    // YMM8[15:0]    -> YMM0[127:112]
//                                      // YMM1[127:16]  -> YMM0[111:0]
//                                      // YMM8[143:128] -> YMM0[255:240]
//                                      // YMM1[255:144] -> YMM0[239:128]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y
//   must be an xmm or ymm register. The first target must be the immediate target. 
//   If you use -> it must come after a memory, xmm target, or target y.
//   The immediate target's  size is one byte, so if you use IMMEDIATE to specify 
//   a minimum size, it must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpblenddcomma ( VPBLENDD, )
//
// C prototype:
//  void dg_forthvpblenddcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
// 
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist 
//     targetzparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for the target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for target y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targets x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls four targets from the data stack and compiles the opcode sequence for
//   an x86 VPBLENDD instruction. This opcode sequence copies each 32 bit 
//   value from either the source or target y depending on whether or not the
//   bit for that section is set or clear in the immediate target. If a bit
//   for a section in the immediate target is set, the source value gets copied.
//   If a bit for a section in the immediate target is clear, the target y
//   value gets copied. The lowest bit in the immediate target determines
//   which of the lowest 32 bit values in either target y or the source gets
//   copied to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM1  XMM0  VPBLENDD,  // XMM1 -> XMM0
//
//  0F N  RBX [R]  XMM1  XMM0  VPBLENDD,  // [RBX][127:0] -> XMM0[127:0]
//
//  0C N  RBX [R]  XMM1  XMM0  VPBLENDD,  // XMM1[63:0]    -> XMM0[63:0]
//                                        // [RBX][127:64] -> XMM0[127:64]
//
//  F0 N  RBX [R]  YMM1  YMM0  VPBLENDD,  // YMM1[127:0]    -> YMM0[127:0]
//                                        // [RBX][255:128] -> YMM0[255:128]
//
//  F0 N  YMM2  YMM1  YMM0  VPBLENDD,     // YMM1[127:0]   -> YMM0[127:0]
//                                        // YMM2[255:128] -> YMM0[255:128]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y
//   must be an xmm or ymm register. The first target must be the immediate target. 
//   If you use -> it must come after a memory, xmm target, or target y.
//   The immediate target's  size is one byte, so if you use IMMEDIATE to specify 
//   a minimum size, it must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpblendwcomma ( VPBLENDW, )
//
// C prototype:
//  void dg_forthvpblendwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
// 
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist 
//     targetzparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for the target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for target y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targets x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls four targets from the data stack and compiles the opcode sequence for
//   an x86 VPBLENDW instruction. This opcode sequence copies each 16 bit 
//   value from either the source or target y depending on whether or not the
//   bit for that section is set or clear in the immediate target. If a bit
//   for a section in the immediate target is set, the source value gets copied.
//   If a bit for a section in the immediate target is clear, the target y
//   value gets copied. The lowest bit in the immediate target determines
//   which of the lowest 16 bit values in either target y or the source gets
//   copied to the destination. If the destination register is a ymm register,
//   then the lowest bit in the immediate target also determines which of the
//   lowest 16 bit values in the upper 128 bits of either target y or the
//   source gets copied.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  XMM1  XMM0  VPBLENDW,  // XMM1 -> XMM0
//
//  FF N  RBX [R]  XMM1  XMM0  VPBLENDW,  // [RBX][127:0] -> XMM0[127:0]
//
//  F0 N  RBX [R]  XMM1  XMM0  VPBLENDW,  // XMM1[63:0]    -> XMM0[63:0]
//                                        // [RBX][127:64] -> XMM0[127:64]
//
//  F0 N  RBX [R]  YMM1  YMM0  VPBLENDW,  // YMM1[63:0]     -> YMM0[63:0]
//                                        // [RBX][127:64]  -> YMM0[127:64]
//                                        // YMM1[191:128]  -> YMM0[191:128]
//                                        // [RBX][255:192] -> YMM0[255:192]
//
//  F1 N  YMM2  YMM1  YMM0  VPBLENDW,     // YMM2[15:0]    -> YMM0[15:0]
//                                        // YMM1[63:16]   -> YMM0[63:16]
//                                        // YMM2[127:64]  -> YMM0[127:64]
//                                        // YMM2[143:128] -> YMM0[143:128]
//                                        // YMM1[191:144] -> YMM0[191:144]
//                                        // YMM2[255:192] -> YMM0[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y
//   must be an xmm or ymm register. The first target must be the immediate target. 
//   If you use -> it must come after a memory, xmm target, or target y.
//   The immediate target's  size is one byte, so if you use IMMEDIATE to specify 
//   a minimum size, it must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpblendvbcomma ( VPBLENDVB, )
//
// C prototype:
//  void dg_forthvpblendvbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
// 
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist 
//     targetzparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for the target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for target y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targets x and z can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls four targets from the data stack and compiles the opcode sequence for
//   an x86 VPBLENDVB instruction. This opcode sequence copies each byte
//   value from either the source or target y depending on whether or not the
//   high bit for the corresponding byte is set or clear in the xmm or ymm register 
//   specified by the immediate target. If the high bit for a byte in the xmm 
//   or ymm register specified by the immediate target is set, the source byte 
//   gets copied, otherwise the target y byte gets copied.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  IF XMM2 = 0
//   20 N  RBX [R]  XMM1  XMM0  VPBLENDVB,  // XMM1 -> XMM0
//
//  IF XMM2 = 8080808080808080
//   20 N  RBX [R]  XMM1  XMM0  VPBLENDVB,  // [RBX][127:0] -> XMM0[127:0]
//
//  IF XMM2 = 8080808080808080
//   XMM2  RBX [R]  XMM1  XMM0  VPBLENDVB,  // [RBX][127:0] -> XMM0[127:0]
//
//  IF XMM2 = 8080808000000000
//   20 N  RBX [R]  XMM1  XMM0  VPBLENDVB,  // XMM1[63:0]    -> XMM0[63:0]
//                                          // [RBX][127:64] -> XMM0[127:64]
//
//  IF YMM2 = 80808080808080800000000000000000
//   YMM2  RBX [R]  YMM1  YMM0  VPBLENDVB,  // YMM1[127:0]    -> YMM0[127:0]
//                                          // [RBX][255:128] -> YMM0[255:128]
//
//  IF YMM2 = 80808080808080800000000000000080  
//   YMM2  RBX [R]  YMM1  YMM0  VPBLENDVB,  // [RBX][7:0]     -> YMM0[7:0]
//                                          // YMM1[127:8]    -> YMM0[127:8]
//                                          // [RBX][255:128] -> YMM0[255:128]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y
//   must be an xmm or ymm register. The first target must be the immediate target. 
//   If you use -> it must come after a memory, xmm target, or target y.
//   The immediate target's  size is one byte, so if you use IMMEDIATE to specify 
//   a minimum size, it must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthaltooutn8comma ( AL->OUT[N8], )
//
// C prototype:
//  void dg_forthaltooutn8comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( n8 -- )
//
// Data stack in:
//
//  n8
//
//  Description of target parameters:
//
//   n8               a number from 0 to 255
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one number from the data stack and compiles the opcode sequence for
//   an x86 AL->OUT[N8], instruction. The sequence copies an 8 bit value from
//   AL to i/o port n8.
//   The number n8 must be in the range of an 8 bit integer and gets 0 extended
//   to 16 bits to form the i/o port address.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  28 N  AL->OUT[N8],                  // does AL->OUT[28],
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_fortheaxtooutn8comma ( EAX->OUT[N8], )
//
// C prototype:
//  void dg_fortheaxtooutn8comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( n8 -- )
//
// Data stack in:
//
//  n8
//
//  Description of target parameters:
//
//   n8               a number from 0 to 255
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one number from the data stack and compiles the opcode sequence for
//   an x86 EAX->OUT[N8], instruction. The sequence copies a 32 bit value from
//   EAX to i/o port n8.
//   The number n8 must be in the range of an 8 bit integer and gets 0 extended
//   to 16 bits to form the i/o port address.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  28 N  EAX->OUT[N8],                  // does EAX->OUT[28],
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthaxtooutn8comma ( AX->OUT[N8], )
//
// C prototype:
//  void dg_forthaxtooutn8comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( n8 -- )
//
// Data stack in:
//
//  n8
//
//  Description of target parameters:
//
//   n8               a number from 0 to 255
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one number from the data stack and compiles the opcode sequence for
//   an x86 AX->OUT[N8], instruction. The sequence copies a 16 bit value from
//   AX to i/o port n8.
//   The number n8 must be in the range of an 8 bit integer and gets 0 extended
//   to 16 bits to form the i/o port address.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  28 N  AX->OUT[N8],                  // does AX->OUT[28],
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthxbeginn16comma ( XBEGINN16, )
//
// C prototype:
//  void dg_forthxbeginn16comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( n16 -- )
//
// Data stack in:
//
//  n16
//
//  Description of target parameters:
//
//   n16               a number in the range of a signed 16 bit integer
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one number from the data stack and compiles the opcode sequence for
//   an x86 XBEGINN16, instruction. This sequence marks the start of an RTM
//   region, and the 16 bit integer is the offset from the address after
//   the end of this opcode sequence to the code to branch to in the event of
//   an RTM abort.
//   The number n16 must be in the range of a signed 16 bit integer.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  28 XBEGINN16,                 // starts an RTM region with an abort
//                                //  address of +28 from the address
//                                //  after the end of this instruction
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthxbeginn32comma ( XBEGINN32, )
//
// C prototype:
//  void dg_forthxbeginn32comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( n32 -- )
//
// Data stack in:
//
//  n32
//
//  Description of target parameters:
//
//   n32               a number in the range of a signed 32 bit integer
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one number from the data stack and compiles the opcode sequence for
//   an x86 XBEGINN32, instruction. This sequence marks the start of an RTM
//   region, and the 32 bit integer is the offset from the address after
//   the end of this opcode sequence to the code to branch to in the event of
//   an RTM abort.
//   The number n32 must be in the range of a signed 32 bit integer.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  -428 XBEGINN32,               // starts an RTM region with an abort
//                                //  address of -428 from the address
//                                //  after the end of this instruction
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthxbeginbranchcomma ( XBEGINBRANCH, )
//
// C prototype:
//  void dg_forthxbeginbranchcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( -- afterbranchoffset )
//
// Data stack out:
//
//  UINT64        afterbranchoffset   offset in bytes from the beginning of the
//                                     current compile buffer immediately after
//                                     the compiled branch instruction
//
// Execute state action:
//  This compiling word compiles the opcode sequence for 0 XBEGINN32,
//  then pushes the current compile buffer's current length to the data stack.
//  This is so you can use a RESOLVE-BRANCH or THEN to mark where an
//  rtm abort goes instead of having to use a label or calculate the offset.
//  The branch range is limited to a signed 32 bit integer.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  OHERE                      // put abort offset onto the data stack
//   ( your RTM abort code )
//   RET,                      // I'm just guessing... (what's an RTM region? :-)
//
//  XBEGINBRANCH,              // starts an RTM region with abort offset
//                             //  to be determined later
//  RESOLVE-BRANCH             // resolve the abort offset... no label required
//   ( your rtm region stuff )
//
//  XBEGINBRANCH,              // starts an RTM region with abort offset
//                             //  to be determined later
//   ( your rtm region stuff )
//   XEND,                     // marks the end of the RTM region
//   ( more stuff )
//   RET,                      // I'm just guessing... (what's an RTM region? :-)
//
//  THEN                       // resolve the abort offset
//   ( your RTM abort code )
//   RET,                      // I'm just guessing... (what's an RTM region? :-)
//
//
// Note:
//  XBEGINBRANCH, uses the data stack to remember the after branch offset
//   so you have to be careful not to put any extra stuff on the stack or
//   accidentally delete the offset.
//
//  If you don't like using THEN you can also use
//   OHERE SWAP RESOLVE-BRANCH
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthprefetchntacomma ( PREFETCHNTA, )
//
// C prototype:
//  void dg_forthprefetchntacomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist -- )
//
// Data stack in:
//  targetxparameterlist
//
//  The parameter list for target x can contain these addressing mode
//   specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 PREFETCHNTA instruction. This opcode sequence gives a hint to the
//   processor that if the processor prefetches the byte at the address
//   specified by this target, to prefetch data into a non temporal cache
//   structure and closer to the processor.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  RBX [R]  PREFETCHNTA,  // give a hint to prefetch memory at address RBX
//                         //  closer to the processor
//
// Note:
//  Intel docs say program execution is not affected, and processors can
//   implement this however they want, including ignoring it. (Prefetching is
//   when a processor guesses at what your code might do next and works ahead.)
//
//  Data size is not required, but if you specify a data size for this
//   instruction it must be 8BIT.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthprefetcht0comma ( PREFETCHT0, )
//
// C prototype:
//  void dg_forthprefetcht0comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist -- )
//
// Data stack in:
//  targetxparameterlist
//
//  The parameter list for target x can contain these addressing mode
//   specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 PREFETCHT0 instruction. This opcode sequence gives a hint to the
//   processor that if the processor prefetches the byte at the address
//   specified by this target, to prefetch data into all levels of the cache
//   hierarchy.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  RBX [R]  PREFETCHT0,  // give a hint to prefetch memory at address RBX
//                        //  to all levels of the cache hierarchy
//
// Note:
//  Intel docs say program execution is not affected, and processors can
//   implement this however they want, including ignoring it. (Prefetching is
//   when a processor guesses at what your code might do next and works ahead.)
//
//  Data size is not required, but if you specify a data size for this
//   instruction it must be 8BIT.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthprefetcht1comma ( PREFETCHT1, )
//
// C prototype:
//  void dg_forthprefetcht1comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist -- )
//
// Data stack in:
//  targetxparameterlist
//
//  The parameter list for target x can contain these addressing mode
//   specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 PREFETCHT1 instruction. This opcode sequence gives a hint to the
//   processor that if the processor prefetches the byte at the address
//   specified by this target, to prefetch data into the level 2 cache
//   or higher.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  RBX [R]  PREFETCHT1,  // give a hint to prefetch memory at address RBX
//                        //  to level 2 of the cache hierarchy or higher
//
// Note:
//  Intel docs say program execution is not affected, and processors can
//   implement this however they want, including ignoring it. (Prefetching is
//   when a processor guesses at what your code might do next and works ahead.)
//
//  Data size is not required, but if you specify a data size for this
//   instruction it must be 8BIT.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthprefetcht2comma ( PREFETCHT2, )
//
// C prototype:
//  void dg_forthprefetcht2comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist -- )
//
// Data stack in:
//  targetxparameterlist
//
//  The parameter list for target x can contain these addressing mode
//   specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 PREFETCHT2 instruction. This opcode sequence gives a hint to the
//   processor that if the processor prefetches the byte at the address
//   specified by this target, to prefetch data into the level 3 cache
//   or higher (or an implementation-specific choice).
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  RBX [R]  PREFETCHT2,  // give a hint to prefetch memory at address RBX
//                        //  to level 3 of the cache hierarchy or higher
//
// Note:
//  Intel docs say program execution is not affected, and processors can
//   implement this however they want, including ignoring it. (Prefetching is
//   when a processor guesses at what your code might do next and works ahead.)
//
//  Data size is not required, but if you specify a data size for this
//   instruction it must be 8BIT.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthprefetchwcomma ( PREFETCHW, )
//
// C prototype:
//  void dg_forthprefetchwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist -- )
//
// Data stack in:
//  targetxparameterlist
//
//  The parameter list for target x can contain these addressing mode
//   specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 PREFETCHW instruction. This opcode sequence gives a hint to the
//   processor that if the processor prefetches the byte at the address
//   specified by this target, to prefetch data closer to the processor and
//   expect a write.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  RBX [R]  PREFETCHW,  // give a hint to prefetch memory at address RBX
//                       //  closer to the processor for a write
//
// Note:
//  Intel docs say program execution is not affected, and processors can
//   implement this however they want, including ignoring it. (Prefetching is
//   when a processor guesses at what your code might do next and works ahead.)
//
//  Data size is not required, but if you specify a data size for this
//   instruction it must be 8BIT.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthprefetchwt1comma ( PREFETCHWT1, )
//
// C prototype:
//  void dg_forthprefetchwt1comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist -- )
//
// Data stack in:
//  targetxparameterlist
//
//  The parameter list for target x can contain these addressing mode
//   specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 PREFETCHWT1 instruction. This opcode sequence gives a hint to the
//   processor that if the processor prefetches the byte at the address
//   specified by this target, to prefetch data with a PREFETCHT1 hint and
//   expect a write. PREFETCHT1 means try to prefetch into the level 2 cache
//   or above.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  RBX [R]  PREFETCHWT1,  // give a hint to prefetch memory at address RBX
//                         //  using a T1 hint for a write
//
// Note:
//  Intel docs say program execution is not affected, and processors can
//   implement this however they want, including ignoring it. (Prefetching is
//   when a processor guesses at what your code might do next and works ahead.)
//
//  Data size is not required, but if you specify a data size for this
//   instruction it must be 8BIT.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthsldtcomma ( SLDT, )
//
// C prototype:
//  void dg_forthsldtcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 SLDT instruction. This opcode stores the 16 bit value from the
//   LDTR register into the destination.
//   The LDTR register is the local descriptor table register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  SLDT,             // LDTR -> [RAX]
//
// Note:
//  Data size is not required for this instruction and is ignored except
//   for specifying a data size of 64 bits. If you specify 64BIT you
//   will get the REX.W prefix but it will have no effect. Only 16 bits will
//   be copied.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthsmswcomma ( SMSW, )
//
// C prototype:
//  void dg_forthsmswcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 SMSW instruction. This opcode stores the 16 bit value from the
//   machine status word bits of register CR0 into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  SMSW,             // CR0[15:0] -> [RAX]
//
// Note:
//  Data size is not required for this instruction and is ignored except
//   for specifying a data size of 64 bits. If you specify 64BIT you
//   will get the REX.W prefix. Intel docs don't say what will happen if
//   there is a REX.W prefix but it will probably be ignored.
//
//  Intel docs say something about not using this instruction on newer
//   processors and prefers you use the MOVCR instruction instead.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthstrcomma ( STR, )
//
// C prototype:
//  void dg_forthstrcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target. If this is used
//                                 and a register A opcode sequence for this
//                                 instruction is available, the register A
//                                 opcode sequence is NOT used. opcode+r, or
//                                 modr/m is used instead.
//                                 R is optional.
//                                 If a targetregister is specified without R,
//                                 and AL, AX, EAX, or RAX is specified,
//                                 and there is a register A opcode sequence
//                                 available, then the register A opcode
//                                 sequence is used.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 STR instruction. This opcode stores the 16 bit value from the
//   TR register into the destination. The TR register is the task
//   register which I think holds a pointer to the task state segment of
//   the currently running task.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  STR,             // TR -> [RAX]
//
// Note:
//  Data size is not required for this instruction and is ignored except
//   for specifying a data size of 64 bits. If you specify 64BIT you
//   will get the REX.W prefix. Intel docs don't say what will happen if
//   there is a REX.W prefix but it will probably be ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthsgdtcomma ( SGDT, )
//
// C prototype:
//  void dg_forthsgdtcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 SGDT instruction. In 32 bit mode, this opcode sequence copies a
//   48 bit value from the GDTR register to the destination. In 64 bit mode,
//   this opcode sequence copies an 80 bit value from the GDTR register.
//   The value copied is an address plus a 16 bit value. The 16 bit value
//   represents a length in bytes.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  SGDT,          //  GDTR -> [RAX]
//
// Note:
//  Data size is not required for this instruction and is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthsidtcomma ( SIDT, )
//
// C prototype:
//  void dg_forthsidtcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 SIDT instruction. In 32 bit mode, this opcode sequence copies a
//   48 bit value from the IDTR register to the destination. In 64 bit mode,
//   this opcode sequence copies an 80 bit value from the IDTR register.
//   The value copied is an address plus a 16 bit length in bytes.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  SIDT,          //  IDTR -> [RAX]
//
// Note:
//  Data size is not required for this instruction and is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthstmxcsrcomma ( STMXCSR, )
//
// C prototype:
//  void dg_forthstmxcsrcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 STMXCSR instruction. In both 32 bit and 64 bit mode, this opcode
//   sequence copies a 32 bit value from the MXCSR register to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  STMXCSR,          //  MXCSR -> [RAX]
//
// Note:
//  Data size is not required for this instruction and is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvstmxcsrcomma ( VSTMXCSR, )
//
// C prototype:
//  void dg_forthvstmxcsrcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 VSTMXCSR instruction. In both 32 bit and 64 bit mode, this opcode
//   sequence copies a 32 bit value from the MXCSR register to the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  VSTMXCSR,          //  MXCSR -> [RAX]
//
// Note:
//  Data size is not required for this instruction and is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthxrstorcomma ( XRSTOR, )
//
// C prototype:
//  void dg_forthxrstorcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 XRSTOR instruction. This code sequence restores the processor state
//   from the memory at the source that was previously set up using one of the
//   XSAVE instructions.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  XRSTOR,          //  it's kinda complicated... please see your
//                            //   processor's documentation
//
// Note:
//  Data size is not required for this instruction and is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthxrstor64comma ( XRSTOR64, )
//
// C prototype:
//  void dg_forthxrstor64comma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 XRSTOR64 instruction. This code sequence restores the processor state
//   from the memory at the source that was previously set up using one of the
//   XSAVE instructions.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  XRSTOR64,          //  it's kinda complicated... please see your
//                              //   processor's documentation
//
// Note:
//  Data size is not required for this instruction and is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthxrstorscomma ( XRSTORS, )
//
// C prototype:
//  void dg_forthxrstorscomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 XRSTORS instruction. This code sequence partially or fully restores
//   the processor state from the memory at the source that was previously set
//   up using one of the XSAVE instructions.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  XRSTORS,           //  it's kinda complicated... please see your
//                              //   processor's documentation
//
// Note:
//  Data size is not required for this instruction and is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthxrstors64comma ( XRSTORS64, )
//
// C prototype:
//  void dg_forthxrstors64comma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 XRSTORS64 instruction. This code sequence partially or fully restores
//   the processor state from the memory at the source that was previously set
//   up using one of the XSAVE instructions.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  XRSTORS64,         //  it's kinda complicated... please see your
//                              //   processor's documentation
//
// Note:
//  Data size is not required for this instruction and is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthxsavecomma ( XSAVE, )
//
// C prototype:
//  void dg_forthxsavecomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 XSAVE instruction. This code sequence saves the processor state
//   to memory.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  XSAVE,           //  it's kinda complicated... please see your
//                            //   processor's documentation
//
// Note:
//  Data size is not required for this instruction and is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthxsave64comma ( XSAVE64, )
//
// C prototype:
//  void dg_forthxsave64comma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 XSAVE64 instruction. This code sequence saves the processor state
//   to memory.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  XSAVE64,           //  it's kinda complicated... please see your
//                              //   processor's documentation
//
// Note:
//  Data size is not required for this instruction and is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthxsaveccomma ( XSAVEC, )
//
// C prototype:
//  void dg_forthxsaveccomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 XSAVEC instruction. This code sequence saves the processor state
//   with compaction to memory.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  XSAVEC,            //  it's kinda complicated... please see your
//                              //   processor's documentation
//
// Note:
//  Data size is not required for this instruction and is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthxsavec64comma ( XSAVEC64, )
//
// C prototype:
//  void dg_forthxsavec64comma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 XSAVEC64 instruction. This code sequence saves the processor state
//   with compaction to memory.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  XSAVEC64,          //  it's kinda complicated... please see your
//                              //   processor's documentation
//
// Note:
//  Data size is not required for this instruction and is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthxsaveoptcomma ( XSAVEOPT, )
//
// C prototype:
//  void dg_forthxsaveoptcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 XSAVEOPT instruction. This code sequence saves the processor state
//   with optimization if possible to memory.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  XSAVEOPT,          //  it's kinda complicated... please see your
//                              //   processor's documentation
//
// Note:
//  Data size is not required for this instruction and is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthxsaveopt64comma ( XSAVEOPT64, )
//
// C prototype:
//  void dg_forthxsaveopt64comma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 XSAVEOPT64 instruction. This code sequence saves the processor state
//   with optimization if possible to memory.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  XSAVEOPT64,        //  it's kinda complicated... please see your
//                              //   processor's documentation
//
// Note:
//  Data size is not required for this instruction and is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthxsavescomma ( XSAVES, )
//
// C prototype:
//  void dg_forthxsavescomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 XSAVES instruction. This code sequence saves the processor state
//   with compaction and with optimization if possible to memory.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  XSAVES,        //  it's kinda complicated... please see your
//                          //   processor's documentation
//
// Note:
//  Data size is not required for this instruction and is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthxsaves64comma ( XSAVES64, )
//
// C prototype:
//  void dg_forthxsaves64comma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain these addressing mode
//   specifiers:
//
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  Description of target parameters:
//
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 XSAVES64 instruction. This code sequence saves the processor state
//   with compaction and with optimization if possible to memory.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX [R]  XSAVES64,        //  it's kinda complicated... please see your
//                            //   processor's documentation
//
// Note:
//  Data size is not required for this instruction and is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrdfsbasecomma ( RDFSBASE, )
//
// C prototype:
//  void dg_forthrdfsbasecomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain this addressing mode
//   specifier:
//
//   targetregister
//
//  You can also do this:
//
//   targetregister R
//
//  Description of target parameters:
//
//   targetregister               one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//
//   R                            specifies a register target.
//                                 R is optional.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 RDFSBASE instruction. This code sequence copies the FS segment
//   base address to the destination register. This instruction is only
//   supported in 64 bit mode.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX     RDFSBASE,        //  FSbaseaddress -> RAX
//
// Note:
//  Data size is not required for this instruction and is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrdgsbasecomma ( RDGSBASE, )
//
// C prototype:
//  void dg_forthrdgsbasecomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain this addressing mode
//   specifier:
//
//   targetregister
//
//  You can also do this:
//
//   targetregister R
//
//  Description of target parameters:
//
//   targetregister               one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//
//   R                            specifies a register target.
//                                 R is optional.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 RDGSBASE instruction. This code sequence copies the GS segment
//   base address to the destination register. This instruction is only
//   supported in 64 bit mode.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX     RDGSBASE,        //  GSbaseaddress -> RAX
//
// Note:
//  Data size is not required for this instruction and is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
// WRFSBASE,
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthwrfsbasecomma ( WRFSBASE, )
//
// C prototype:
//  void dg_forthwrfsbasecomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain this addressing mode
//   specifier:
//
//   targetregister
//
//  You can also do this:
//
//   targetregister R
//
//  Description of target parameters:
//
//   targetregister               one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//
//   R                            specifies a register target.
//                                 R is optional.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 WRFSBASE instruction. This code sequence copies the source
//   register to the FS segment base address. This instruction is only
//   supported in 64 bit mode.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX     WRFSBASE,        //   RAX -> FSbaseaddress
//
// Note:
//  Data size is not required for this instruction and is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthwrgsbasecomma ( WRGSBASE, )
//
// C prototype:
//  void dg_forthwrgsbasecomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//
//  ( targetparameterlist -- )
//
// Data stack in:
//
//  targetparameterlist
//
//
//  The parameter list for a target can contain this addressing mode
//   specifier:
//
//   targetregister
//
//  You can also do this:
//
//   targetregister R
//
//  Description of target parameters:
//
//   targetregister               one of:
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//
//   R                            specifies a register target.
//                                 R is optional.
//
// Data stack out:
//  none
//
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 WRGSBASE instruction. This code sequence copies the source
//   register to the GS segment base address. This instruction is only
//   supported in 64 bit mode.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RAX     WRGSBASE,        //  RAX -> GSbaseaddress
//
// Note:
//  Data size is not required for this instruction and is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpdepcomma ( PDEP, )
//
// C prototype:
//  void dg_forthpdepcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for targetyparameterlist can
//   contain this addressing mode specifier:
//
//   targetregister
//
//  The parameter list for targetxparameterlist and targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a targetxparameterlist and
//   targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  If you want to set the data size for memory targets, you can use these:
//   (This is optional for this instruction. The compiler can figure it out from
//   the size of the destination register.)
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of a memory target in bytes,
//                                 can be 4 or 8
//   DATASIZE                     sets the data size of a memory target
//                                 This is pushed after a memory target's
//                                 parameters and can not come in the middle
//                                 of a memory target.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 PDEP instruction. This opcode sequence takes bits in order of low
//   to high from the source target, and deposits them into the destination at
//   the indexes from low to high where a bit is set in target y.
//   Destination bits that don't get a bit from the source are cleared.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//
//   RAX [R]  EDX  ECX  PDEP,  // [RAX] spread out using EDX -> ECX
//
//   RAX [R]  RDX  RCX  PDEP,  // [RAX] spread out using RDX -> RCX
//
//   RAX  RDX  RCX  PDEP,      // RAX spread out using RDX -> RCX
//
//   RCX <-  RDX  RAX  PDEP, // RAX spread out using RDX -> RCX
//
// Note:
//  Putting reverse after any target makes the first target pushed the
//   destination target, and the third target pushed the source target.
//  Only one target can be a memory target. The destination must be a
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpextcomma ( PEXT, )
//
// C prototype:
//  void dg_forthpextcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for targetyparameterlist can
//   contain this addressing mode specifier:
//
//   targetregister
//
//  The parameter list for targetxparameterlist and targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a targetxparameterlist and
//   targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  If you want to set the data size for memory targets, you can use these:
//   (This is optional for this instruction. The compiler can figure it out from
//   the size of the destination register.)
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of a memory target in bytes,
//                                 can be 4 or 8
//   DATASIZE                     sets the data size of a memory target
//                                 This is pushed after a memory target
//                                 parameters and can not come in the middle
//                                 of a memory target.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 PEXT instruction. This opcode sequence takes bits in order of low
//   to high from the source target from the indexes where a bit is set in
//   target y, and deposits them in order from low to high into the destination.
//   Destination bits that don't get a bit from the source are cleared.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples
//
//   RAX [R]  EDX  ECX  PEXT,  // [RAX] compacted using EDX -> ECX
//
//   RAX [R]  RDX  RCX  PEXT,  // [RAX] compacted using RDX -> RCX
//
//   RAX  RDX  RCX  PEXT,      // RAX compacted using RDX -> RCX
//
//   RCX <-  RDX  RAX  PEXT,   // RAX compacted using RDX -> RCX
//
// Note:
//  Putting reverse after any target makes the first target pushed the
//   destination target, and the third target pushed the source target.
//  Only one target can be a memory target. The destination must be a
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpextrbcomma ( PEXTRB, )
//
// C prototype:
//  void dg_forthpextrbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( immediatetargetparameterlist xmmtargetparameterlist rormtargetparameterlist -- )
//
// Data stack in:
//
//  immediatetargetparameterlist
//  xmmtargetparameterlist
//  rormtargetparameterlist
//
//
//  The parameter list for the immediatetargetparameterlist can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//
//  The parameter list for the xmmtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//
//  The parameter list for the rormtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to explicitly declare the data size for a memory target you can use this,
//   however it is not required:
//   8BIT
//
//  Alternative way to set the data size:
//   1 DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value (0 to 15 for this instruction)
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after the memory target
//                                 parameters and can not come in the middle
//                                 of the memory target parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0 or 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 PEXTRB instruction. This opcode sequence copies a byte from an
//   xmm register source and puts it into a regular register or memory
//   destination. The index of the byte copied is the immediate target's value.
//   The byte value copied gets zero extended to the current address mode size.
//   Because of the way the Intel docs were written... and to avoid confusion
//   I made this instruction ignore the size of the register unless you specify
//   the regular register with R. So in 64 bit address mode, the byte gets
//   zero extended to 64 bits. If you use R, then the register has to be an 8,
//   32 or 64 bit register. If you use R with a 64 bit register, then you will get
//   the rex.w prefix, however, the opcode sequence does the same thing.
//   At least it does on my icore3 :-)
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  XMM1  RAX [R]  PEXTRB,  //  XMM1[7:0]   -> [RAX][7:0]
//
//  1 N  XMM2  RAX [R]  PEXTRB,  //  XMM2[15:8]  -> [RAX][7:0]
//
//  3 N  XMM5  RCX     PEXTRB,   //  XMM5[31:24] -> RCX[7:0]
//                               //            0 -> RCX[63:8]
//
// Note:
//  If you explicitly declare the memory target size, it must be 8BIT. This is
//   not required for this compiling word.
//  Reverse not supported for this compiling word.
//  If you use R with an 8 bit register, it's treated like a 32 bit register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpextrbcomma ( VPEXTRB, )
//
// C prototype:
//  void dg_forthvpextrbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( immediatetargetparameterlist xmmtargetparameterlist rormtargetparameterlist -- )
//
// Data stack in:
//
//  immediatetargetparameterlist
//  xmmtargetparameterlist
//  rormtargetparameterlist
//
//
//  The parameter list for the immediatetargetparameterlist can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//
//  The parameter list for the xmmtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//
//  The parameter list for the rormtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to explicitly declare the data size for a memory target you can use this,
//   however it is not required:
//   8BIT
//
//  Alternative way to set the data size:
//   1 DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value (0 to 15 for this instruction)
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after the memory target
//                                 parameters and can not come in the middle
//                                 of the memory target parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0 or 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPEXTRB instruction. This opcode sequence copies a byte from an
//   xmm register source and puts it into a regular register or memory
//   destination. The index of the byte copied is the immediate target's value.
//   If the destination target is a register, the byte value copied gets zero
//   extended to the current address mode size. If the destination is memory,
//   only one byte of the memory target is changed.
//   Because of the way the Intel docs were written, and to avoid confusion,
//   this instruction ignores the size of the destination register.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  XMM1  RAX [R]  VPEXTRB,  //  XMM1[7:0]   -> [RAX][7:0]
//
//  1 N  XMM2  RAX [R]  VPEXTRB,  //  XMM2[15:8]  -> [RAX][7:0]
//
//  3 N  XMM5  RCX     VPEXTRB,   //  XMM5[31:24] -> RCX[7:0]
//                                //            0 -> RCX[63:8]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpextrdcomma ( PEXTRD, )
//
// C prototype:
//  void dg_forthpextrdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( immediatetargetparameterlist xmmtargetparameterlist rormtargetparameterlist -- )
//
// Data stack in:
//
//  immediatetargetparameterlist
//  xmmtargetparameterlist
//  rormtargetparameterlist
//
//
//  The parameter list for the immediatetargetparameterlist can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//
//  The parameter list for the xmmtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//
//  The parameter list for the rormtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to explicitly declare the data size for a memory target you can use this,
//   however it is not required:
//   32BIT
//
//  Alternative way to set the data size:
//   4 DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value (0 to 3 for this instruction)
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   32BIT                         sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0 or 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 4
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 PEXTRD instruction. This opcode sequence copies a 32 bit integer from
//   an xmm register source and puts it into a regular register or memory
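//   destination. The index of the 32 bit integer copied is the immediate
//   target's value. If the destination is a register, the 32 bit integer value
//   copied gets zero extended to 64 bits in 64 bit address mode. If the
//   destination is memory, only 32 bits of the memory target get changed.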
//   This instruction ignores the size of the register unless you specify
//   the regular register with R. If you use R, then the register has to be an 8 or
//   32 bit register.
//   If you don't use R, this means you can do something silly like specifying
//   AL as the destination register... but in reality it will be EAX, and the
//   upper 32 bits of RAX will get cleared.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  XMM1  RAX [R]  PEXTRD,  //  XMM1[31:0]   -> [RAX][31:0]
//
//  1 N  XMM2  RAX [R]  PEXTRD,  //  XMM2[63:32]  -> [RAX][31:0]
//
//  3 N  XMM5  RCX     PEXTRD,   //  XMM5[127:96] -> RCX[31:0]
//                               //             0 -> RCX[63:32]
//
// Note:
//  If you explicitly declare the memory target size, it must be 32BIT. This is
//   not required for this compiling word.
//  Reverse not supported for this compiling word.
//  If you use R with an 8 bit register, it's treated like a 32 bit register.
//  If you use R with a 64 bit register, then you will get the
//   rex.w prefix, which will turn this instruction into a PEXTRQ.
//   It's like this so you can add the rex.w prefix with EXTRACTPS 3/30/2020
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpextrdcomma ( VPEXTRD, )
//
// C prototype:
//  void dg_forthvpextrdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( immediatetargetparameterlist xmmtargetparameterlist rormtargetparameterlist -- )
//
// Data stack in:
//
//  immediatetargetparameterlist
//  xmmtargetparameterlist
//  rormtargetparameterlist
//
//
//  The parameter list for the immediatetargetparameterlist can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//
//  The parameter list for the xmmtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//
//  The parameter list for the rormtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to explicitly declare the data size for a memory target you can use this,
//   however it is not required:
//   32BIT
//
//  Alternative way to set the data size:
//   4 DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value (0 to 3 for this instruction)
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   32BIT                         sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0 or 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 4
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPEXTRD instruction. This opcode sequence copies a 32 bit integer from
//   an xmm register source and puts it into a regular register or memory
//   destination. The index of the 32 bit integer copied is the immediate
//   target's value. If the destination is a register, the 32 bit integer value
//   copied gets zero extended to 64 bits in 64 bit address mode. If the
//   destination is memory, only 32 bits of the memory target gets changed.
//   This instruction ignores the size of the destination register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  XMM1  RAX [R]  VPEXTRD,  //  XMM1[31:0]   -> [RAX][31:0]
//
//  1 N  XMM2  RAX [R]  VPEXTRD,  //  XMM2[63:32]  -> [RAX][31:0]
//
//  3 N  XMM5  RCX     VPEXTRD,   //  XMM5[127:96] -> RCX[31:0]
//                                //             0 -> RCX[63:32]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpextrqcomma ( PEXTRQ, )
//
// C prototype:
//  void dg_forthpextrqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( immediatetargetparameterlist xmmtargetparameterlist rormtargetparameterlist -- )
//
// Data stack in:
//
//  immediatetargetparameterlist
//  xmmtargetparameterlist
//  rormtargetparameterlist
//
//
//  The parameter list for the immediatetargetparameterlist can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//
//  The parameter list for the xmmtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//
//  The parameter list for the rormtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to explicitly declare the data size for a memory target you can use this,
//   however it is not required:
//   64BIT
//
//  Alternative way to set the data size:
//   8 DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value (0 or 1 for this instruction)
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   64BIT                         sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0 or 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 PEXTRQ instruction. This opcode sequence copies a 64 bit integer from
//   an xmm register source and puts it into a regular register or memory
//   destination. The index of the 64 bit integer copied is the immediate
//   target's value.
//   This instruction ignores the size of the register unless you specify
//   the regular register with R. This means if you don't use R, you can do
//   something silly like specifying AL as the destination register ... but in
//   reality it will be RAX.
//
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  XMM1  RAX [R]  PEXTRQ,  //  XMM1[63:0]   -> [RAX][63:0]
//
//  1 N  XMM2  RAX [R]  PEXTRQ,  //  XMM2[127:64] -> [RAX][63:0]
//
//  1 N  XMM5  RCX     PEXTRQ,   //  XMM5[127:64] -> RCX[63:0]
//
// Note:
//  If you explicitly declare the memory target size, it must be 64BIT. This is
//   not required for this compiling word.
//  Reverse not supported for this compiling word.
//  If you use R, the destination register has to be a 64 bit register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpextrqcomma ( VPEXTRQ, )
//
// C prototype:
//  void dg_forthvpextrqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( immediatetargetparameterlist xmmtargetparameterlist rormtargetparameterlist -- )
//
// Data stack in:
//
//  immediatetargetparameterlist
//  xmmtargetparameterlist
//  rormtargetparameterlist
//
//
//  The parameter list for the immediatetargetparameterlist can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//
//  The parameter list for the xmmtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//
//  The parameter list for the rormtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to explicitly declare the data size for a memory target you can use this,
//   however it is not required:
//   64BIT
//
//  Alternative way to set the data size:
//   8 DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value (0 or 1 for this instruction)
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   64BIT                         sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0 or 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPEXTRQ instruction. This opcode sequence copies a 64 bit integer from
//   an xmm register source and puts it into a regular register or memory
//   destination. The index of the 64 bit integer copied is the immediate
//   target's value.
//   This instruction ignores the size of the destination register.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  XMM1  RAX [R]  VPEXTRQ,  //  XMM1[63:0]   -> [RAX][63:0]
//
//  1 N  XMM2  RAX [R]  VPEXTRQ,  //  XMM2[127:64] -> [RAX][63:0]
//
//  1 N  XMM5  RCX     VPEXTRQ,   //  XMM5[127:64] -> RCX[63:0]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpextrwcomma ( PEXTRW, )
//
// C prototype:
//  void dg_forthpextrwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( immediatetargetparameterlist targetxparameterlist rormtargetparameterlist -- )
//
// Data stack in:
//
//  immediatetargetparameterlist
//  targetxparameterlist
//  rormtargetparameterlist
//
//
//  The parameter list for the immediatetargetparameterlist can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//
//  The parameter list for the targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   floatingpointregister FPSR
//   targetxmmregister
//   targetxmmregister XMMR
//
//  The parameter list for the rormtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to explicitly declare the data size for a memory target you can use this,
//   however it is not required:
//   16BIT
//
//  Alternative way to set the data size:
//   2 DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value (0 to 7 for this instruction)
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   16BIT                         sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0 or 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 2
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 PEXTRW instruction. This opcode sequence copies a 16 bit integer from
//   the source to the destination.
//   If the source is a floating point register, the destination must be a regular
//   register. If the source is an xmm register, the destination can be a regular
//   register or memory.
//   The index of the 16 bit integer copied is the immediate target's value.
//   This instruction ignores the size of the register. This means you can do
//   something silly like make the destination AL, but the 16 bit value will
//   be zero extended to either 32 or 64 bits based on the address size
//   mode and then put into the register.
//   If the source is an XMM register and the destination is a regular register,
//   there are two encodings. If source is an XMM register and you use R for the
//   regular register, you get the same opcode as the XMM register to register or
//   memory opcode sequence. If the source is an XMM register and you do not use
//   R for the regular register, you get the XMM register to register opcode
//   sequence.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  XMM1  RAX [R]  PEXTRW,  //  XMM1[15:0]   -> [RAX][15:0]
//
//  1 N  XMM2  RAX [R]  PEXTRW,  //  XMM2[31:16]  -> [RAX][15:0]
//
//  3 N  XMM5  CX      PEXTRW,   //  XMM5[63:48]  -> RCX[15:0]
//                               //            0  -> RCX[63:16]
//
//  3 N  ST5  CX      PEXTRW,    //  ST5[63:48]    -> RCX[15:0]
//                               //           0    -> RCX[63:16]
//
//  7 N  XMM4  DX      PEXTRW,   //  XMM4[127:112] -> RDX[15:0]
//                               //              0 -> RDX[63:16]
//
// Note:
//  If you explicitly declare the memory target size, it must be 16BIT. This is
//   not required for this compiling word.
//  Reverse not supported for this compiling word.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpextrwcomma ( VPEXTRW, )
//
// C prototype:
//  void dg_forthvpextrwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( immediatetargetparameterlist targetxparameterlist rormtargetparameterlist -- )
//
// Data stack in:
//
//  immediatetargetparameterlist
//  targetxparameterlist
//  rormtargetparameterlist
//
//
//  The parameter list for the immediatetargetparameterlist can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//
//  The parameter list for the targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   floatingpointregister FPSR
//   targetxmmregister
//   targetxmmregister XMMR
//
//  The parameter list for the rormtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to explicitly declare the data size for a memory target you can use this,
//   however it is not required:
//   16BIT
//
//  Alternative way to set the data size:
//   2 DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value (0 to 7 for this instruction)
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   16BIT                         sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0 or 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 2
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPEXTRW instruction. This opcode sequence copies a 16 bit integer from
//   the source to the destination.
//   The index of the 16 bit integer copied is the immediate target's value.
//   If the destination is a register target, then this instruction ignores the
//   size of the register and zero extends the result to the current address mode
//   size. If the destination is a memory target then only 16 bits of the
//   destination are changed.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  XMM1  RAX [R]  VPEXTRW,  //  XMM1[15:0]   -> [RAX][15:0]
//
//  1 N  XMM2  RAX [R]  VPEXTRW,  //  XMM2[31:16]  -> [RAX][15:0]
//
//  3 N  XMM5  CX      VPEXTRW,   //  XMM5[63:48]  -> RCX[15:0]
//                                //            0  -> RCX[63:16]
//
//  3 N  ST5  CX       VPEXTRW,   //  ST5[63:48]    -> RCX[15:0]
//                                //           0    -> RCX[63:16]
//
//  7 N  XMM4  DX      VPEXTRW,   //  XMM4[127:112] -> RDX[15:0]
//                                //              0 -> RDX[63:16]
//
// Note:
//  If you explicitly declare the memory target size, it must be 16BIT. This is
//   not required for this compiling word.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpextrw2comma ( VPEXTRW2, )
//
// C prototype:
//  void dg_forthvpextrw2comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( immediatetargetparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  immediatetargetparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for the immediatetargetparameterlist can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//
//  The parameter list for the targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//
//  The parameter list for the targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value (0 to 7 for this instruction)
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0 or 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target.
//                                 specifying R for the destination forces
//                                 three byte vex encoding
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPEXTRW2 instruction. This opcode sequence copies a 16 bit integer from
//   the source to the destination.
//   The index of the 16 bit integer copied is the immediate target's value.
//   This instruction ignores the size of the destination register and zero extends
//   the result to the current address mode size.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//
//  3 N  XMM5  CX      VPEXTRW2,   //  XMM5[63:48]  -> RCX[15:0]
//                                 //            0  -> RCX[63:16]
//
//  3 N  ST5  CX       VPEXTRW2,   //  ST5[63:48]    -> RCX[15:0]
//                                 //           0    -> RCX[63:16]
//
//  7 N  XMM4  DX      VPEXTRW2,   //  XMM4[127:112] -> RDX[15:0]
//                                 //              0 -> RDX[63:16]
//
// Note:
//  If you use R to specify the destination register then the three byte form
//   of vex encoding is used even if the two byte form is possible.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpslldqcomma ( VPSLLDQ, )
//
// C prototype:
//  void dg_forthvpslldqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetcountparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetcountparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for the targetcountparameterlist can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//
//  The parameter list for the targetxparameterlist or targetyparameterlist can contain
//   these addressing mode specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//   targetymmregister
//   targetymmregister YMMR
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value (the shift count in bytes)
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding. If this is not
//                                 used then 2 byte vex encoding is used if
//                                 possible.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0 or 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmm register target
//   YMMR                         specifies a ymm register target
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSLLDQ instruction. This opcode sequence shifts 128 bit values from
//   the source to the left count bytes and puts the result into the destination.
//   Zeroes are shifted in from the right. If the source and destination are ymm
//   registers, the lower 128 bits and upper 128 bits get shifted separately.
//   In other words, if you are using ymm registers, zeroes are shifted in to
//   the upper 128 bits from the right also.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//
//  3 N  XMM5  XMM1  VPSLLDQ,  //  XMM5[127:0] << 8*count -> XMM1[127:0]
//
//  3 N  YMM5  YMM1  VPSLLDQ,  //  YMM5[127:0]   << 8*count
//                             //   -> YMM1[127:0]
//                             //  YMM5[255:128] << 8*count
//                             //   -> YMM1[255:128]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpsrldqcomma ( VPSRLDQ, )
//
// C prototype:
//  void dg_forthvpsrldqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetcountparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetcountparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for the targetcountparameterlist can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//
//  The parameter list for the targetxparameterlist or targetyparameterlist can contain
//   these addressing mode specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//   targetymmregister
//   targetymmregister YMMR
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value (the shift count in bytes)
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding. If this is not
//                                 used then 2 byte vex encoding is used if
//                                 possible.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0 or 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmm register target
//   YMMR                         specifies a ymm register target
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSRLDQ instruction. This opcode sequence shifts 128 bit values from
//   the source to the right count bytes and puts the result into the destination.
//   Zeroes are shifted in from the left. If the source and destination are ymm
//   registers, the lower 128 bits and upper 128 bits get shifted separately.
//   In other words, if you are using ymm registers, zeroes are shifted in to
//   the upper 128 bits from the left also.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//
//  3 N  XMM5  XMM1  VPSRLDQ,  //  XMM5[127:0] >> 8*count -> XMM1[127:0]
//
//  3 N  YMM5  YMM1  VPSRLDQ,  //  YMM5[127:0]   >> 8*count
//                             //   -> YMM1[127:0]
//                             //  YMM5[255:128] >> 8*count
//                             //   -> YMM1[255:128]
//
// Note:
//  Using XMMR or YMMR to specify the destination target forces 3 byte vex
//   encoding.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthphadddcomma ( PHADDD, )
//
// C prototype:
//  void dg_forthphadddcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PHADDD instruction. This sequence adds adjacent pairs of 32 bit
//   integers from the destination and adds adjacent pairs of 32 bit integers
//   from the source and puts the 32 bit integer results into the destination.
//   The results from the destination pairs go into the lower half of the
//   destination, and the results from the source pairs go into the upper
//   half of the destination. The flags are not changed.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PHADDD,    // XMM1[63:32]    + XMM1[31:0]     -> XMM1[31:0]
//                            // XMM1[127:96]   + XMM1[95:64]    -> XMM1[63:32]
//                            // [RBX][63:32]   + [RBX][31:0]    -> XMM1[95:64]
//                            // [RBX][127:96]  + [RBX][95:64]   -> XMM1[127:96]
//
//  RBX [R]  ST1  PHADDD,     // ST1[63:32]    + ST1[31:0]     -> ST1[31:0]
//                            // [RBX][63:32]  + [RBX][31:0]   -> ST1[63:32]
//
//  XMM2  XMM1  PHADDD,       // XMM1[63:32]    + XMM1[31:0]    -> XMM1[31:0]
//                            // XMM1[127:96]   + XMM1[95:64]   -> XMM1[63:32]
//                            // XMM2[63:32]    + XMM2[31:0]    -> XMM1[95:64]
//                            // XMM2[127:96]   + XMM2[95:64]   -> XMM1[127:96]
//
//  ST2  ST1  PHADDD,         // ST1[63:32]    + ST1[31:0]   -> ST1[31:0]
//                            // ST2[63:32]    + ST2[31:0]   -> ST1[63:32]
//
//  XMM1 <-  XMM2  PHADDD,    // XMM1[63:32]    + XMM1[31:0]    -> XMM1[31:0]
//                            // XMM1[127:96]   + XMM1[95:64]   -> XMM1[63:32]
//                            // XMM2[63:32]    + XMM2[31:0]    -> XMM1[95:64]
//                            // XMM2[127:96]   + XMM2[95:64]   -> XMM1[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//   Signed and unsigned addition are the same when the sources and destination
//   are all the same size.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvphadddcomma ( VPHADDD, )
//
// C prototype:
//  void dg_forthvphadddcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPHADDD instruction. This opcode sequence adds adjacent pairs of 32
//   bit values in target y and adds adjacent pairs from source and puts the 
//   results into the destination. The two lowest 32 bit results come from the
//   two pairs of 32 bit values in the lower 128 bits of target y. The next two
//   32 bit results come from the two pairs of 32 bit values in the lower 128
//   bits of the source. The next two 32 bit results come from the two pairs of
//   32 bit values in the upper 128 bits of target y, and the last two 32 bit
//   results come from the two pairs of 32 bit values in the upper 128 bits of
//   the source. 
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VPHADDD, // XMM1[31:0]    + XMM1[63:32]  -> XMM0[31:0]
//                                // XMM1[127:96]  + XMM1[95:64]  -> XMM0[63:32]
//                                // [RBX][31:0]   + [RBX][63:32] -> XMM0[95:64]
//                                // [RBX][127:96] + [RBX][95:64] -> XMM0[127:96]
//
//  XMM2  XMM1  XMM0  VPHADDD,    // XMM1[31:0]    + XMM1[63:32] -> XMM0[31:0]
//                                // XMM1[127:96]  + XMM1[95:64] -> XMM0[63:32]
//                                // XMM2[31:0]    + XMM2[63:32] -> XMM0[95:64]
//                                // XMM2[127:96]  + XMM2[95:64] -> XMM0[127:96]
//
//  YMM2  YMM1  YMM0  VPHADDD,    // YMM1[31:0]    + YMM1[63:32]   -> YMM0[31:0]
//                                // YMM1[127:96]  + YMM1[95:64]   -> YMM0[63:32]
//                                // YMM2[31:0]    + YMM2[63:32]   -> YMM0[95:64]
//                                // YMM2[127:96]  + YMM2[95:64]   -> YMM0[127:96]
//                                // YMM1[159:128] + YMM1[191:160] -> YMM0[159:128]
//                                // YMM1[223:192] + YMM1[255:224] -> YMM0[191:160]
//                                // YMM2[159:128] + YMM2[191:160] -> YMM0[223:192]
//                                // YMM2[223:192] + YMM2[255:224] -> YMM0[255:224]
//
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvphaddwcomma ( VPHADDW, )
//
// C prototype:
//  void dg_forthvphaddwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPHADDW instruction. This opcode sequence adds adjacent pairs of 16
//   bit values in target y and adds adjacent pairs from source and puts the 
//   results into the destination. The four lowest 16 bit results come from the
//   four pairs of 16 bit values in the lower 128 bits of target y. The next four
//   16 bit results come from the four pairs of 16 bit values in the lower 128
//   bits of the source. The next four 16 bit results come from the four pairs of
//   16 bit values in the upper 128 bits of target y, and the last four 16 bit
//   results come from the four pairs of 16 bit values in the upper 128 bits of
//   the source. 
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VPHADDW, // XMM1[15:0]    + XMM1[31:16]    -> XMM0[15:0]
//                                // XMM1[47:32]   + XMM1[63:48]    -> XMM0[31:16]
//                                // XMM1[79:64]   + XMM1[95:80]    -> XMM0[47:32]
//                                // XMM1[111:96]  + XMM1[127:112]  -> XMM0[63:48]
//                                // [RBX][15:0]   + [RBX][31:16]   -> XMM0[79:64]
//                                // [RBX][47:32]  + [RBX][63:48]   -> XMM0[95:80]
//                                // [RBX][79:64]  + [RBX][95:80]   -> XMM0[111:96]
//                                // [RBX][111:96] + [RBX][127:112] -> XMM0[127:112]
//
//  XMM2  XMM1  XMM0  VPHADDW,    // XMM1[15:0]    + XMM1[31:16]    -> XMM0[15:0]
//                                // XMM1[47:32]   + XMM1[63:48]    -> XMM0[31:16]
//                                // XMM1[79:64]   + XMM1[95:80]    -> XMM0[47:32]
//                                // XMM1[111:96]  + XMM1[127:112]  -> XMM0[63:48]
//                                // XMM2[15:0]    + XMM2[31:16]    -> XMM0[79:64]
//                                // XMM2[47:32]   + XMM2[63:48]    -> XMM0[95:80]
//                                // XMM2[79:64]   + XMM2[95:80]    -> XMM0[111:96]
//                                // XMM2[111:96]  + XMM2[127:112]  -> XMM0[127:112]
//
//  YMM2  YMM1  YMM0  VPHADDW,    // YMM1[15:0]    + YMM1[31:16]    -> YMM0[15:0]
//                                // YMM1[47:32]   + YMM1[63:48]    -> YMM0[31:16]
//                                // YMM1[79:64]   + YMM1[95:80]    -> YMM0[47:32]
//                                // YMM1[111:96]  + YMM1[127:112]  -> YMM0[63:48]
//                                // YMM2[15:0]    + YMM2[31:16]    -> YMM0[79:64]
//                                // YMM2[47:32]   + YMM2[63:48]    -> YMM0[95:80]
//                                // YMM2[79:64]   + YMM2[95:80]    -> YMM0[111:96]
//                                // YMM2[111:96]  + YMM2[127:112]  -> YMM0[127:112]
//                                // YMM1[143:128] + YMM1[159:144]  -> YMM0[143:128]
//                                // YMM1[175:160] + YMM1[191:176]  -> YMM0[159:144]
//                                // YMM1[207:192] + YMM1[223:208]  -> YMM0[175:160]
//                                // YMM1[239:224] + YMM1[255:240]  -> YMM0[191:176]
//                                // YMM2[143:128] + YMM2[159:144]  -> YMM0[207:192]
//                                // YMM2[175:160] + YMM2[191:176]  -> YMM0[223:208]
//                                // YMM2[207:192] + YMM2[223:208]  -> YMM0[239:224]
//                                // YMM2[239:224] + YMM2[255:240]  -> YMM0[255:240]
//
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvphaddswcomma ( VPHADDSW, )
//
// C prototype:
//  void dg_forthvphaddswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPHADDSW instruction. This opcode sequence adds adjacent pairs of
//   signed 16 bit values in target y and adds adjacent pairs of signed 16 bit 
//   values from source, then if the results are more or less than what will fit 
//   into signed 16 bit values it limits the results to the most positive or
//   negative 16 bit values possible and puts the results into the destination. 
//   The four lowest 16 bit results come from the
//   four pairs of 16 bit values in the lower 128 bits of target y. The next four
//   16 bit results come from the four pairs of 16 bit values in the lower 128
//   bits of the source. The next four 16 bit results come from the four pairs of
//   16 bit values in the upper 128 bits of target y, and the last four 16 bit
//   results come from the four pairs of 16 bit values in the upper 128 bits of
//   the source. 
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VPHADDSW, // [RBX][15:0]   + [RBX][31:16]   -> XMM0[15:0]
//                                 // [RBX][47:32]  + [RBX][63:48]   -> XMM0[31:16]
//                                 // [RBX][79:64]  + [RBX][95:80]   -> XMM0[47:32]
//                                 // [RBX][111:96] + [RBX][127:112] -> XMM0[63:48]
//                                 // XMM1[15:0]    + XMM1[31:16]    -> XMM0[79:64]
//                                 // XMM1[47:32]   + XMM1[63:48]    -> XMM0[95:80]
//                                 // XMM1[79:64]   + XMM1[95:80]    -> XMM0[111:96]
//                                 // XMM1[111:96]  + XMM1[127:112]  -> XMM0[127:112]
//
//  XMM2  XMM1  XMM0  VPHADDSW,   // XMM2[15:0]    + XMM2[31:16]    -> XMM0[15:0]
//                                // XMM2[47:32]   + XMM2[63:48]    -> XMM0[31:16]
//                                // XMM2[79:64]   + XMM2[95:80]    -> XMM0[47:32]
//                                // XMM2[111:96]  + XMM2[127:112]  -> XMM0[63:48]
//                                // XMM1[15:0]    + XMM1[31:16]    -> XMM0[79:64]
//                                // XMM1[47:32]   + XMM1[63:48]    -> XMM0[95:80]
//                                // XMM1[79:64]   + XMM1[95:80]    -> XMM0[111:96]
//                                // XMM1[111:96]  + XMM1[127:112]  -> XMM0[127:112]
//
//  YMM2  YMM1  YMM0  VPHADDSW,   // YMM2[15:0]    + YMM2[31:16]    -> YMM0[15:0]
//                                // YMM2[47:32]   + YMM2[63:48]    -> YMM0[31:16]
//                                // YMM2[79:64]   + YMM2[95:80]    -> YMM0[47:32]
//                                // YMM2[111:96]  + YMM2[127:112]  -> YMM0[63:48]
//                                // YMM1[15:0]    + YMM1[31:16]    -> YMM0[79:64]
//                                // YMM1[47:32]   + YMM1[63:48]    -> YMM0[95:80]
//                                // YMM1[79:64]   + YMM1[95:80]    -> YMM0[111:96]
//                                // YMM1[111:96]  + YMM1[127:112]  -> YMM0[127:112]
//                                // YMM2[143:128] + YMM2[159:144]  -> YMM0[143:128]
//                                // YMM2[175:160] + YMM2[191:176]  -> YMM0[159:144]
//                                // YMM2[207:192] + YMM2[223:208]  -> YMM0[175:160]
//                                // YMM2[239:224] + YMM2[255:240]  -> YMM0[191:176]
//                                // YMM1[143:128] + YMM1[159:144]  -> YMM0[207:192]
//                                // YMM1[175:160] + YMM1[191:176]  -> YMM0[223:208]
//                                // YMM1[207:192] + YMM1[223:208]  -> YMM0[239:224]
//                                // YMM1[239:224] + YMM1[255:240]  -> YMM0[255:240]
//
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvphsubdcomma ( VPHSUBD, )
//
// C prototype:
//  void dg_forthvphsubdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPHSUBD instruction. This opcode sequence subtracts adjacent pairs 
//   of 32 bit values in target y and subtracts adjacent pairs from source and 
//   puts the results into the destination. The two lowest 32 bit results come 
//   from the two pairs of 32 bit values in the lower 128 bits of target y. The 
//   next two 32 bit results come from the two pairs of 32 bit values in the 
//   lower 128 bits of the source. The next two 32 bit results come from the two
//   pairs of 32 bit values in the upper 128 bits of target y, and the last two 
//   32 bit results come from the two pairs of 32 bit values in the upper 128 
//   bits of the source. The higher indexed value is subtracted from the lower
//   indexed value.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VPHSUBD, // [RBX][31:0]   - [RBX][63:32]  -> XMM0[31:0]
//                                // [RBX][95:64]  - [RBX][127:96] -> XMM0[63:32]
//                                // XMM1[31:0]    - XMM1[63:32]   -> XMM0[95:64]
//                                // XMM1[95:64]   - XMM1[127:96]  -> XMM0[127:96]
//
//  XMM2  XMM1  XMM0  VPHSUBD,    // XMM2[31:0]    - XMM2[63:32]   -> XMM0[31:0]
//                                // XMM2[95:64]   - XMM2[127:96]  -> XMM0[63:32]
//                                // XMM1[31:0]    - XMM1[63:32]   -> XMM0[95:64]
//                                // XMM1[95:64]   - XMM1[127:96]  -> XMM0[127:96]
//
//  YMM2  YMM1  YMM0  VPHSUBD,    // YMM2[31:0]    - YMM2[63:32]   -> YMM0[31:0]
//                                // YMM2[95:64]   - YMM2[127:96]  -> YMM0[63:32]
//                                // YMM1[31:0]    - YMM1[63:32]   -> YMM0[95:64]
//                                // YMM1[95:64]   - YMM1[127:96]  -> YMM0[127:96]
//                                // YMM2[159:128] - YMM2[191:160] -> YMM0[159:128]
//                                // YMM2[223:192] - YMM2[255:224] -> YMM0[191:160]
//                                // YMM1[159:128] - YMM1[191:160] -> YMM0[223:192]
//                                // YMM1[223:192] - YMM1[255:224] -> YMM0[255:224]
//
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvphsubwcomma ( VPHSUBW, )
//
// C prototype:
//  void dg_forthvphsubwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPHSUBW instruction. This opcode sequence subtracts adjacent pairs 
//   of 16 bit values in target y and subtracts adjacent pairs from source and 
//   puts the results into the destination. The four lowest 16 bit results come 
//   from the four pairs of 16 bit values in the lower 128 bits of target y. The 
//   next four 16 bit results come from the four pairs of 16 bit values in the 
//   lower 128 bits of the source. The next four 16 bit results come from the
//   four pairs of 16 bit values in the upper 128 bits of target y, and the last 
//   four 16 bit results come from the four pairs of 16 bit values in the upper 
//   128 bits of the source. The higher indexed value of each pair is subtracted
//   from the lower indexed value.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VPHSUBW, // [RBX][15:0]   - [RBX][31:16]   -> XMM0[15:0]
//                                // [RBX][47:32]  - [RBX][63:48]   -> XMM0[31:16]
//                                // [RBX][79:64]  - [RBX][95:80]   -> XMM0[47:32]
//                                // [RBX][111:96] - [RBX][127:112] -> XMM0[63:48]
//                                // XMM1[15:0]    - XMM1[31:16]    -> XMM0[79:64]
//                                // XMM1[47:32]   - XMM1[63:48]    -> XMM0[95:80]
//                                // XMM1[79:64]   - XMM1[95:80]    -> XMM0[111:96]
//                                // XMM1[111:96]  - XMM1[127:112]  -> XMM0[127:112]
//
//  XMM2  XMM1  XMM0  VPHSUBW,    // XMM2[15:0]    - XMM2[31:16]    -> XMM0[15:0]
//                                // XMM2[47:32]   - XMM2[63:48]    -> XMM0[31:16]
//                                // XMM2[79:64]   - XMM2[95:80]    -> XMM0[47:32]
//                                // XMM2[111:96]  - XMM2[127:112]  -> XMM0[63:48]
//                                // XMM1[15:0]    - XMM1[31:16]    -> XMM0[79:64]
//                                // XMM1[47:32]   - XMM1[63:48]    -> XMM0[95:80]
//                                // XMM1[79:64]   - XMM1[95:80]    -> XMM0[111:96]
//                                // XMM1[111:96]  - XMM1[127:112]  -> XMM0[127:112]
//
//  YMM2  YMM1  YMM0  VPHSUBW,    // YMM2[15:0]    - YMM2[31:16]    -> YMM0[15:0]
//                                // YMM2[47:32]   - YMM2[63:48]    -> YMM0[31:16]
//                                // YMM2[79:64]   - YMM2[95:80]    -> YMM0[47:32]
//                                // YMM2[111:96]  - YMM2[127:112]  -> YMM0[63:48]
//                                // YMM1[15:0]    - YMM1[31:16]    -> YMM0[79:64]
//                                // YMM1[47:32]   - YMM1[63:48]    -> YMM0[95:80]
//                                // YMM1[79:64]   - YMM1[95:80]    -> YMM0[111:96]
//                                // YMM1[111:96]  - YMM1[127:112]  -> YMM0[127:112]
//                                // YMM2[143:128] - YMM2[159:144]  -> YMM0[143:128]
//                                // YMM2[175:160] - YMM2[191:176]  -> YMM0[159:144]
//                                // YMM2[207:192] - YMM2[223:208]  -> YMM0[175:160]
//                                // YMM2[239:224] - YMM2[255:240]  -> YMM0[191:176]
//                                // YMM1[143:128] - YMM1[159:144]  -> YMM0[207:192]
//                                // YMM1[175:160] - YMM1[191:176]  -> YMM0[223:208]
//                                // YMM1[207:192] - YMM1[223:208]  -> YMM0[239:224]
//                                // YMM1[239:224] - YMM1[255:240]  -> YMM0[255:240]
//
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvphsubswcomma ( VPHSUBSW, )
//
// C prototype:
//  void dg_forthvphsubswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPHSUBSW instruction. This opcode sequence subtracts adjacent pairs 
//   of signed 16 bit values in target y and subtracts adjacent pairs of signed 
//   16 bit values from source, then if the results are more or less than what 
//   will fit into signed 16 bit values it limits the results to the most 
//   positive or negative 16 bit values possible and puts the results into the 
//   destination. 
//  The four lowest 16 bit results come from the four pairs of 16 bit values in 
//   the lower 128 bits of target y. The next four 16 bit results come from the 
//   four pairs of 16 bit values in the lower 128 bits of the source. The next 
//   four 16 bit results come from the four pairs of 16 bit values in the upper
//   128 bits of target y, and the last four 16 bit results come from the four 
//   pairs of 16 bit values in the upper 128 bits of the source. 
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VPHSUBSW, // XMM1[15:0]    - XMM1[31:16]    -> XMM0[15:0]
//                                 // XMM1[47:32]   - XMM1[63:48]    -> XMM0[31:16]
//                                 // XMM1[79:64]   - XMM1[95:80]    -> XMM0[47:32]
//                                 // XMM1[111:96]  - XMM1[127:112]  -> XMM0[63:48]
//                                 // [RBX][15:0]   - [RBX][31:16]   -> XMM0[79:64]
//                                 // [RBX][47:32]  - [RBX][63:48]   -> XMM0[95:80]
//                                 // [RBX][79:64]  - [RBX][95:80]   -> XMM0[111:96]
//                                 // [RBX][111:96] - [RBX][127:112] -> XMM0[127:112]
//
//  XMM2  XMM1  XMM0  VPHSUBSW,   // XMM1[15:0]    - XMM1[31:16]    -> XMM0[15:0]
//                                // XMM1[47:32]   - XMM1[63:48]    -> XMM0[31:16]
//                                // XMM1[79:64]   - XMM1[95:80]    -> XMM0[47:32]
//                                // XMM1[111:96]  - XMM1[127:112]  -> XMM0[63:48]
//                                // XMM2[15:0]    - XMM2[31:16]    -> XMM0[79:64]
//                                // XMM2[47:32]   - XMM2[63:48]    -> XMM0[95:80]
//                                // XMM2[79:64]   - XMM2[95:80]    -> XMM0[111:96]
//                                // XMM2[111:96]  - XMM2[127:112]  -> XMM0[127:112]
//
//  YMM2  YMM1  YMM0  VPHSUBSW,   // YMM1[15:0]    - YMM1[31:16]    -> YMM0[15:0]
//                                // YMM1[47:32]   - YMM1[63:48]    -> YMM0[31:16]
//                                // YMM1[79:64]   - YMM1[95:80]    -> YMM0[47:32]
//                                // YMM1[111:96]  - YMM1[127:112]  -> YMM0[63:48]
//                                // YMM2[15:0]    - YMM2[31:16]    -> YMM0[79:64]
//                                // YMM2[47:32]   - YMM2[63:48]    -> YMM0[95:80]
//                                // YMM2[79:64]   - YMM2[95:80]    -> YMM0[111:96]
//                                // YMM2[111:96]  - YMM2[127:112]  -> YMM0[127:112]
//                                // YMM1[143:128] - YMM1[159:144]  -> YMM0[143:128]
//                                // YMM1[175:160] - YMM1[191:176]  -> YMM0[159:144]
//                                // YMM1[207:192] - YMM1[223:208]  -> YMM0[175:160]
//                                // YMM1[239:224] - YMM1[255:240]  -> YMM0[191:176]
//                                // YMM2[143:128] - YMM2[159:144]  -> YMM0[207:192]
//                                // YMM2[175:160] - YMM2[191:176]  -> YMM0[223:208]
//                                // YMM2[207:192] - YMM2[223:208]  -> YMM0[239:224]
//                                // YMM2[239:224] - YMM2[255:240]  -> YMM0[255:240]
//
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthphaddwcomma ( PHADDW, )
//
// C prototype:
//  void dg_forthphaddwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PHADDW instruction. This sequence adds adjacent pairs of 16 bit
//   integers from the destination and adds adjacent pairs of 16 bit integers
//   from the source and puts the 16 bit integer results into the destination.
//   The results from the destination pairs go into the lower half of the
//   destination, and the results from the source pairs go into the upper
//   half. The lowest destination result goes into the lowest 16 bits of the
//   destination. The flags are not changed.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PHADDW,    // XMM1[31:16]    + XMM1[15:0]     -> XMM1[15:0]
//                            // XMM1[63:48]    + XMM1[47:32]    -> XMM1[31:16]
//                            // XMM1[95:80]    + XMM1[79:64]    -> XMM1[47:32]
//                            // XMM1[127:112]  + XMM1[111:96]   -> XMM1[63:48]
//                            // [RBX][31:16]   + [RBX][15:0]    -> XMM1[79:64]
//                            // [RBX][63:48]   + [RBX][47:32]   -> XMM1[95:80]
//                            // [RBX][95:80]   + [RBX][79:64]   -> XMM1[111:96]
//                            // [RBX][127:112] + [RBX][111:96]  -> XMM1[127:112]
//
//  RBX [R]  ST1  PHADDW,     // ST1[31:16]    + ST1[15:0]     -> ST1[15:0]
//                            // ST1[63:48]    + ST1[47:32]    -> ST1[31:16]
//                            // [RBX][31:16]  + [RBX][15:0]   -> ST1[47:32]
//                            // [RBX][63:48]  + [RBX][47:32]  -> ST1[63:48]
//
//  XMM2  XMM1  PHADDW,       // XMM1[31:16]   + XMM1[15:0]    -> XMM1[15:0]
//                            // XMM1[63:48]   + XMM1[47:32]   -> XMM1[31:16]
//                            // XMM1[95:80]   + XMM1[79:64]   -> XMM1[47:32]
//                            // XMM1[127:112] + XMM1[111:96]  -> XMM1[63:48]
//                            // XMM2[31:16]   + XMM2[15:0]    -> XMM1[79:64]
//                            // XMM2[63:48]   + XMM2[47:32]   -> XMM1[95:80]
//                            // XMM2[95:80]   + XMM2[79:64]   -> XMM1[111:96]
//                            // XMM2[127:112] + XMM2[111:96]  -> XMM1[127:112]
//
//  ST2  ST1  PHADDW,         // ST1[31:16]  + ST1[15:0]   -> ST1[15:0]
//                            // ST1[63:48]  + ST1[47:32]  -> ST1[31:16]
//                            // ST2[31:16]  + ST2[15:0]   -> ST1[47:32]
//                            // ST2[63:48]  + ST2[47:32]  -> ST1[63:48]
//
//  XMM1 <-  XMM2  PHADDW, // XMM1[31:16]   + XMM1[15:0]    -> XMM1[15:0]
//                            // XMM1[63:48]   + XMM1[47:32]   -> XMM1[31:16]
//                            // XMM1[95:80]   + XMM1[79:64]   -> XMM1[47:32]
//                            // XMM1[127:112] + XMM1[111:96]  -> XMM1[63:48]
//                            // XMM2[31:16]   + XMM2[15:0]    -> XMM1[79:64]
//                            // XMM2[63:48]   + XMM2[47:32]   -> XMM1[95:80]
//                            // XMM2[95:80]   + XMM2[79:64]   -> XMM1[111:96]
//                            // XMM2[127:112] + XMM2[111:96]  -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//   Signed and unsigned addition are the same when the sources and destination
//   are all the same size. (It's the carry and overflow flags that help you tell
//   the difference.)
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthphaddswcomma ( PHADDSW, )
//
// C prototype:
//  void dg_forthphaddswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PHADDSW instruction. This sequence adds adjacent pairs of signed
//   16 bit integers from the destination and adds adjacent pairs of signed
//   16 bit integers from the source and puts the 16 bit integer results into
//   the destination. The results from the destination pairs go into the
//   lower half of the destination, and the results from the source pairs go
//   into the upper half. If a result is more or less than what will fit into
//   a 16 bit signed integer, the result is clipped to the maximum or minimum
//   signed 16 bit value possible. The lowest destination result goes into
//   the lowest 16 bits of the destination. The flags are not changed.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PHADDSW,    // XMM1[31:16]    + XMM1[15:0]     -> XMM1[15:0]
//                             // XMM1[63:48]    + XMM1[47:32]    -> XMM1[31:16]
//                             // XMM1[95:80]    + XMM1[79:64]    -> XMM1[47:32]
//                             // XMM1[127:112]  + XMM1[111:96]   -> XMM1[63:48]
//                             // [RBX][31:16]   + [RBX][15:0]    -> XMM1[79:64]
//                             // [RBX][63:48]   + [RBX][47:32]   -> XMM1[95:80]
//                             // [RBX][95:80]   + [RBX][79:64]   -> XMM1[111:96]
//                             // [RBX][127:112] + [RBX][111:96]  -> XMM1[127:112]
//
//  RBX [R]  ST1  PHADDSW,     // ST1[31:16]    + ST1[15:0]     -> ST1[15:0]
//                             // ST1[63:48]    + ST1[47:32]    -> ST1[31:16]
//                             // [RBX][31:16]  + [RBX][15:0]   -> ST1[47:32]
//                             // [RBX][63:48]  + [RBX][47:32]  -> ST1[63:48]
//
//  XMM2  XMM1  PHADDSW,       // XMM1[31:16]   + XMM1[15:0]    -> XMM1[15:0]
//                             // XMM1[63:48]   + XMM1[47:32]   -> XMM1[31:16]
//                             // XMM1[95:80]   + XMM1[79:64]   -> XMM1[47:32]
//                             // XMM1[127:112] + XMM1[111:96]  -> XMM1[63:48]
//                             // XMM2[31:16]   + XMM2[15:0]    -> XMM1[79:64]
//                             // XMM2[63:48]   + XMM2[47:32]   -> XMM1[95:80]
//                             // XMM2[95:80]   + XMM2[79:64]   -> XMM1[111:96]
//                             // XMM2[127:112] + XMM2[111:96]  -> XMM1[127:112]
//
//  ST2  ST1  PHADDSW,         // ST1[31:16]  + ST1[15:0]   -> ST1[15:0]
//                             // ST1[63:48]  + ST1[47:32]  -> ST1[31:16]
//                             // ST2[31:16]  + ST2[15:0]   -> ST1[47:32]
//                             // ST2[63:48]  + ST2[47:32]  -> ST1[63:48]
//
//  XMM1 <-  XMM2  PHADDSW, // XMM1[31:16]   + XMM1[15:0]    -> XMM1[15:0]
//                             // XMM1[63:48]   + XMM1[47:32]   -> XMM1[31:16]
//                             // XMM1[95:80]   + XMM1[79:64]   -> XMM1[47:32]
//                             // XMM1[127:112] + XMM1[111:96]  -> XMM1[63:48]
//                             // XMM2[31:16]   + XMM2[15:0]    -> XMM1[79:64]
//                             // XMM2[63:48]   + XMM2[47:32]   -> XMM1[95:80]
//                             // XMM2[95:80]   + XMM2[79:64]   -> XMM1[111:96]
//                             // XMM2[127:112] + XMM2[111:96]  -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
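//   Signed and unsigned addition are the same when the sources and destination
//   are all the same size, but only while the result is not clipped. The
//   clipping done by this instruction treats the values as signed, so a
//   clipped result is only correct for signed 16 bit integers.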
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthphsubdcomma ( PHSUBD, )
//
// C prototype:
//  void dg_forthphsubdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PHSUBD instruction. This sequence subtracts within adjacent pairs
//   of 32 bit integers in the destination and subtracts within adjacent pairs
//   of 32 bit integers in the source and puts the 32 bit integer results into
//   the destination. The results from the destination pairs go into the
//   lower half of the destination, and the results from the source pairs go
//   into the upper half. The lowest destination result goes into the lowest
//   32 bits of the destination. The flags are not changed. The higher
//   position 32 bit integer of the pair is subtracted from the lower position
//   32 bit integer.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PHSUBD,    // XMM1[31:0]    - XMM1[63:32]     -> XMM1[31:0]
//                            // XMM1[95:64]   - XMM1[127:96]    -> XMM1[63:32]
//                            // [RBX][31:0]   - [RBX][63:32]    -> XMM1[95:64]
//                            // [RBX][95:64]  - [RBX][127:96]   -> XMM1[127:96]
//
//  RBX [R]  ST1  PHSUBD,     // ST1[31:0]    - ST1[63:32]     -> ST1[31:0]
//                            // [RBX][31:0]  - [RBX][63:32]   -> ST1[63:32]
//
//  XMM2  XMM1  PHSUBD,       // XMM1[31:0]    - XMM1[63:32]    -> XMM1[31:0]
//                            // XMM1[95:64]   - XMM1[127:96]   -> XMM1[63:32]
//                            // XMM2[31:0]    - XMM2[63:32]    -> XMM1[95:64]
//                            // XMM2[95:64]   - XMM2[127:96]   -> XMM1[127:96]
//
//  ST2  ST1  PHSUBD,         // ST1[31:0]    - ST1[63:32]   -> ST1[31:0]
//                            // ST2[31:0]    - ST2[63:32]   -> ST1[63:32]
//
//  XMM1 <-  XMM2  PHSUBD, // XMM1[31:0]    - XMM1[63:32]    -> XMM1[31:0]
//                            // XMM1[95:64]   - XMM1[127:96]   -> XMM1[63:32]
//                            // XMM2[31:0]    - XMM2[63:32]    -> XMM1[95:64]
//                            // XMM2[95:64]   - XMM2[127:96]   -> XMM1[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthphsubwcomma ( PHSUBW, )
//
// C prototype:
//  void dg_forthphsubwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PHSUBW instruction. This sequence subtracts pairs of signed 16 bit
//   integers from the destination and subtracts pairs of signed 16 bit integers
//   from the source and puts the signed 16 bit integer results into the
//   destination alternating between source and destination results.
//   The lowest destination result goes into the lowest 16 bits of the destination.
//   The flags are not changed. The higher position 16 bit integer of the pair is
//   subtracted from the lower position 16 bit integer.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PHSUBW,    // XMM1[15:0]    - XMM1[31:16]    -> XMM1[15:0]
//                            // [RBX][15:0]   - [RBX][31:16]   -> XMM1[31:16]
//                            // XMM1[47:32]   - XMM1[63:48]    -> XMM1[47:32]
//                            // [RBX][47:32]  - [RBX][63:48]   -> XMM1[63:48]
//                            // XMM1[79:64]   - XMM1[95:80]    -> XMM1[79:64]
//                            // [RBX][79:64]  - [RBX][95:80]   -> XMM1[95:80]
//                            // XMM1[111:96]  - XMM1[127:112]  -> XMM1[111:96]
//                            // [RBX][111:96] - [RBX][127:112] -> XMM1[127:112]
//
//  RBX [R]  ST1  PHSUBW,     // ST1[15:0]    - ST1[31:16]    -> ST1[15:0]
//                            // [RBX][15:0]  - [RBX][31:16]  -> ST1[31:16]
//                            // ST1[47:32]   - ST1[63:48]    -> ST1[47:32]
//                            // [RBX][47:32] - [RBX][63:48]  -> ST1[63:48]
//
//  XMM2  XMM1  PHSUBW,       // XMM1[15:0]   - XMM1[31:16]   -> XMM1[15:0]
//                            // XMM2[15:0]   - XMM2[31:16]   -> XMM1[31:16]
//                            // XMM1[47:32]  - XMM1[63:48]   -> XMM1[47:32]
//                            // XMM2[47:32]  - XMM2[63:48]   -> XMM1[63:48]
//                            // XMM1[79:64]  - XMM1[95:80]   -> XMM1[79:64]
//                            // XMM2[79:64]  - XMM2[95:80]   -> XMM1[95:80]
//                            // XMM1[111:96] - XMM1[127:112] -> XMM1[111:96]
//                            // XMM2[111:96] - XMM2[127:112] -> XMM1[127:112]
//
//  ST2  ST1  PHSUBW,         // ST1[15:0]  - ST1[31:16]  -> ST1[15:0]
//                            // ST2[15:0]  - ST2[31:16]  -> ST1[31:16]
//                            // ST1[47:32] - ST1[63:48]  -> ST1[47:32]
//                            // ST2[47:32] - ST2[63:48]  -> ST1[63:48]
//
//  XMM1 <-  XMM2  PHSUBW, // XMM1[15:0]   - XMM1[31:16]   -> XMM1[15:0]
//                            // XMM2[15:0]   - XMM2[31:16]   -> XMM1[31:16]
//                            // XMM1[47:32]  - XMM1[63:48]   -> XMM1[47:32]
//                            // XMM2[47:32]  - XMM2[63:48]   -> XMM1[63:48]
//                            // XMM1[79:64]  - XMM1[95:80]   -> XMM1[79:64]
//                            // XMM2[79:64]  - XMM2[95:80]   -> XMM1[95:80]
//                            // XMM1[111:96] - XMM1[127:112] -> XMM1[111:96]
//                            // XMM2[111:96] - XMM2[127:112] -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthphsubswcomma ( PHSUBSW, )
//
// C prototype:
//  void dg_forthphsubswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PHSUBSW instruction. This sequence subtracts pairs of signed 16 bit
//   integers from the destination and subtracts pairs of signed 16 bit integers
//   from the source and puts the signed 16 bit integer results into the
//   destination alternating between source and destination results. If the
//   result of a subtraction is more than what will fit into a signed 16 bit
//   integer, the result is clipped to the maximum or minimum 16 bit value possible.
//   The lowest destination result goes into the lowest 16 bits of the destination.
//   The flags are not changed. The higher position 16 bit integer of the pair is
//   subtracted from the lower position 16 bit integer.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PHSUBSW,    // XMM1[15:0]    - XMM1[31:16]    -> XMM1[15:0]
//                             // [RBX][15:0]   - [RBX][31:16]   -> XMM1[31:16]
//                             // XMM1[47:32]   - XMM1[63:48]    -> XMM1[47:32]
//                             // [RBX][47:32]  - [RBX][63:48]   -> XMM1[63:48]
//                             // XMM1[79:64]   - XMM1[95:80]    -> XMM1[79:64]
//                             // [RBX][79:64]  - [RBX][95:80]   -> XMM1[95:80]
//                             // XMM1[111:96]  - XMM1[127:112]  -> XMM1[111:96]
//                             // [RBX][111:96] - [RBX][127:112] -> XMM1[127:112]
//
//  RBX [R]  ST1  PHSUBSW,     // ST1[15:0]    - ST1[31:16]    -> ST1[15:0]
//                             // [RBX][15:0]  - [RBX][31:16]  -> ST1[31:16]
//                             // ST1[47:32]   - ST1[63:48]    -> ST1[47:32]
//                             // [RBX][47:32] - [RBX][63:48]  -> ST1[63:48]
//
//  XMM2  XMM1  PHSUBSW,      // XMM1[15:0]   - XMM1[31:16]   -> XMM1[15:0]
//                            // XMM2[15:0]   - XMM2[31:16]   -> XMM1[31:16]
//                            // XMM1[47:32]  - XMM1[63:48]   -> XMM1[47:32]
//                            // XMM2[47:32]  - XMM2[63:48]   -> XMM1[63:48]
//                            // XMM1[79:64]  - XMM1[95:80]   -> XMM1[79:64]
//                            // XMM2[79:64]  - XMM2[95:80]   -> XMM1[95:80]
//                            // XMM1[111:96] - XMM1[127:112] -> XMM1[111:96]
//                            // XMM2[111:96] - XMM2[127:112] -> XMM1[127:112]
//
//  ST2  ST1  PHSUBSW,        // ST1[15:0]  - ST1[31:16]  -> ST1[15:0]
//                            // ST2[15:0]  - ST2[31:16]  -> ST1[31:16]
//                            // ST1[47:32] - ST1[63:48]  -> ST1[47:32]
//                            // ST2[47:32] - ST2[63:48]  -> ST1[63:48]
//
//  XMM1 <-  XMM2  PHSUBSW, // XMM1[15:0]   - XMM1[31:16]   -> XMM1[15:0]
//                             // XMM2[15:0]   - XMM2[31:16]   -> XMM1[31:16]
//                             // XMM1[47:32]  - XMM1[63:48]   -> XMM1[47:32]
//                             // XMM2[47:32]  - XMM2[63:48]   -> XMM1[63:48]
//                             // XMM1[79:64]  - XMM1[95:80]   -> XMM1[79:64]
//                             // XMM2[79:64]  - XMM2[95:80]   -> XMM1[95:80]
//                             // XMM1[111:96] - XMM1[127:112] -> XMM1[111:96]
//                             // XMM2[111:96] - XMM2[127:112] -> XMM1[127:112]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpinsrbcomma ( PINSRB, )
//
// C prototype:
//  void dg_forthpinsrbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( immediatetargetparameterlist rormtargetparameterlist xmmtargetparameterlist -- )
//
// Data stack in:
//
//  immediatetargetparameterlist
//  rormtargetparameterlist
//  xmmtargetparameterlist
//
//
//  The parameter list for the immediatetargetparameterlist can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//
//  The parameter list for the xmmtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//
//  The parameter list for the rormtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to explicitly declare the data size for a memory target you can use this,
//   however it is not required:
//   8BIT
//
//  Alternative way to set the data size:
//   1 DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value (0 to 15 for this instruction)
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after the memory target
//                                 parameters and can not come in the middle
//                                 of the memory target parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0, 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 PINSRB instruction. This opcode sequence copies a byte from a
//   regular register or memory source and puts it into an xmm register
//   destination. The index of the destination byte is the immediate target's value.
//   Source register size or memory data size is ignored for this compiling word.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  RAX [R]  XMM1  PINSRB, //  [RAX][7:0] -> XMM1[7:0]
//
//  1 N  RAX [R]  XMM2  PINSRB, //  [RAX][7:0] -> XMM2[15:8]
//
//  3 N  CL  XMM5  PINSRB,      //  CL[7:0]    -> XMM5[31:24]
//
// Note:
//  Reverse not supported for this compiling word.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpinsrbcomma ( VPINSRB, )
//
// C prototype:
//  void dg_forthvpinsrbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( immediatetargetparameterlist rormtargetparameterlist xmmtargetparameterlist xmmtargetparameterlist -- )
//
// Data stack in:
//
//  immediatetargetparameterlist
//  rormtargetparameterlist
//  xmmtargetparameterlist
//  xmmtargetparameterlist
//
//
//  The parameter list for the immediatetargetparameterlist can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//
//  The parameter list for the xmmtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//
//  The parameter list for the rormtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to explicitly declare the data size for a memory target you can use this,
//   however it is not required:
//   8BIT
//
//  Alternative way to set the data size:
//   1 DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value (0 to 15 for this instruction)
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after the memory target
//                                 parameters and can not come in the middle
//                                 of the memory target parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0, 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 1
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls four targets from the data stack and compiles the opcode sequence for
//   an x86 VPINSRB instruction. This opcode sequence copies a byte from a
//   regular register or memory source, overwrites a byte in an xmm register
//   second source at the index from the immediate target's value and then
//   puts the result into another xmm register destination.
//   Source register size is ignored for this compiling word.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  RAX[R]  XMM2  XMM1   VPINSRB,  //  [RAX][7:0]   -> XMM1[7:0]
//                                      //  XMM2[127:8]  -> XMM1[127:8]
//
//  1 N  RAX[R]  XMM1  XMM2  VPINSRB,   //  [RAX][7:0]   -> XMM2[15:8]
//                                      //  XMM1[7:0]    -> XMM2[7:0]
//                                      //  XMM1[127:16] -> XMM2[127:16]
//
//  3 N  CL  XMM3  XMM5  VPINSRB,       //  CL[7:0]      -> XMM5[31:24]
//                                      //  XMM3[23:0]   -> XMM5[23:0]
//                                      //  XMM3[127:32] -> XMM5[127:32]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpinsrdcomma ( PINSRD, )
//
// C prototype:
//  void dg_forthpinsrdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( immediatetargetparameterlist rormtargetparameterlist xmmtargetparameterlist -- )
//
// Data stack in:
//
//  immediatetargetparameterlist
//  rormtargetparameterlist
//  xmmtargetparameterlist
//
//
//  The parameter list for the immediatetargetparameterlist can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//
//  The parameter list for the xmmtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//
//  The parameter list for the rormtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to explicitly declare the data size for a memory target you can use this,
//   however it is not required:
//   32BIT
//
//  Alternative way to set the data size:
//   4 DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value (0 to 3 for this instruction)
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   32BIT                         sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0, 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 4
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 PINSRD instruction. This opcode sequence copies a 32 bit integer from
//   a regular register or memory source and puts it into an xmm register
//   destination. The index of the 32 bit integer copied is the immediate
//   target's value.
//   Source register size or memory data size is ignored for this compiling word.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  RAX[R]  XMM1  PINSRD,  //  [RAX][31:0] ->  XMM1[31:0]
//
//  1 N  RAX[R]  XMM2  PINSRD,  //  [RAX][31:0] ->  XMM2[63:32]
//
//  3 N  ECX  XMM5  PINSRD,     //  ECX[31:0]   ->  XMM5[127:96]
//
// Note:
//  Reverse not supported for this compiling word.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpinsrdcomma ( VPINSRD, )
//
// C prototype:
//  void dg_forthvpinsrdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( immediatetargetparameterlist rormtargetparameterlist xmmtargetparameterlist xmmtargetparameterlist -- )
//
// Data stack in:
//
//  immediatetargetparameterlist
//  rormtargetparameterlist
//  xmmtargetparameterlist
//  xmmtargetparameterlist
//
//
//  The parameter list for the immediatetargetparameterlist can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//
//  The parameter list for the xmmtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//
//  The parameter list for the rormtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to explicitly declare the data size for a memory target you can use this,
//   however it is not required:
//   32BIT
//
//  Alternative way to set the data size:
//   4 DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value (0 to 3 for this instruction)
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after the memory target
//                                 parameters and can not come in the middle
//                                 of the memory target parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0, 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 4
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls four targets from the data stack and compiles the opcode sequence for
//   an x86 VPINSRD instruction. This opcode sequence copies a 32 bit value from
//   a regular register or memory source, overwrites a 32 bit value in an xmm
//   register second source at the index from the immediate target's value and
//   then puts the result into another xmm register destination.
//   Source register size is ignored for this compiling word.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  RAX[R]  XMM2  XMM1   VPINSRD,  //  [RAX][31:0]  -> XMM1[31:0]
//                                      //  XMM2[127:32] -> XMM1[127:32]
//
//  1 N  RAX[R]  XMM1  XMM2  VPINSRD,   //  [RAX][31:0]  -> XMM2[63:32]
//                                      //  XMM1[31:0]   -> XMM2[31:0]
//                                      //  XMM1[127:64] -> XMM2[127:64]
//
//  3 N  ECX  XMM3  XMM5  VPINSRD,      //  ECX[31:0]    -> XMM5[127:96]
//                                      //  XMM3[95:0]   -> XMM5[95:0]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpinsrqcomma ( PINSRQ, )
//
// C prototype:
//  void dg_forthpinsrqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( immediatetargetparameterlist rormtargetparameterlist xmmtargetparameterlist -- )
//
// Data stack in:
//
//  immediatetargetparameterlist
//  rormtargetparameterlist
//  xmmtargetparameterlist
//
//
//  The parameter list for the immediatetargetparameterlist can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//
//  The parameter list for the xmmtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//
//  The parameter list for the rormtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to explicitly declare the data size for a memory target you can use this,
//   however it is not required:
//   64BIT
//
//  Alternative way to set the data size:
//   8 DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value (0 or 1 for this instruction)
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   64BIT                         sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0, 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 PINSRQ instruction. This opcode sequence copies a 64 bit integer from
//   a regular register or memory source and puts it into an xmm register
//   destination. The index of the 64 bit integer copied is the immediate
//   target's value.
//   Source register size or memory data size is ignored for this compiling word.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  RAX[R]  XMM1  PINSRQ,  //  [RAX][63:0] -> XMM1[63:0]
//
//  1 N  RAX[R]  XMM2  PINSRQ,  //  [RAX][63:0] -> XMM2[127:64]
//
//  1 N RCX  XMM5  PINSRQ,      //  RCX[63:0]   -> XMM5[127:64]
//
// Note:
//  Reverse not supported for this compiling word.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpinsrqcomma ( VPINSRQ, )
//
// C prototype:
//  void dg_forthvpinsrqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( immediatetargetparameterlist rormtargetparameterlist xmmtargetparameterlist xmmtargetparameterlist -- )
//
// Data stack in:
//
//  immediatetargetparameterlist
//  rormtargetparameterlist
//  xmmtargetparameterlist
//  xmmtargetparameterlist
//
//
//  The parameter list for the immediatetargetparameterlist can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//
//  The parameter list for the xmmtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//
//  The parameter list for the rormtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to explicitly declare the data size for a memory target you can use this,
//   however it is not required:
//   64BIT
//
//  Alternative way to set the data size:
//   8 DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value (0 or 1 for this instruction)
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after the memory target
//                                 parameters and can not come in the middle
//                                 of the memory target parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0, 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 8
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls four targets from the data stack and compiles the opcode sequence for
//   an x86 VPINSRQ instruction. This opcode sequence copies a 64 bit value from
//   a regular register or memory source, overwrites a 64 bit value in an xmm
//   register second source at the index from the immediate target's value and
//   then puts the result into another xmm register destination.
//   Source register size is ignored for this compiling word.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  RAX[R]  XMM2  XMM1   VPINSRQ,  //  [RAX][63:0]  -> XMM1[63:0]
//                                      //  XMM2[127:64] -> XMM1[127:64]
//
//  1 N  RAX[R]  XMM1  XMM2  VPINSRQ,   //  [RAX][63:0]  -> XMM2[127:64]
//                                      //  XMM1[63:0]   -> XMM2[63:0]
//
//  1 N  RCX  XMM3  XMM5  VPINSRQ,      //  RCX[63:0]    -> XMM5[127:64]
//                                      //  XMM3[63:0]   -> XMM5[63:0]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpinsrwcomma ( PINSRW, )
//
// C prototype:
//  void dg_forthpinsrwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( immediatetargetparameterlist rormtargetparameterlist targetxparameterlist -- )
//
// Data stack in:
//
//  immediatetargetparameterlist
//  rormtargetparameterlist
//  targetxparameterlist
//
//
//  The parameter list for the immediatetargetparameterlist can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//
//  The parameter list for the targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   floatingpointregister FPSR
//   targetxmmregister
//   targetxmmregister XMMR
//
//  The parameter list for the rormtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to explicitly declare the data size for a memory target you can use this,
//   however it is not required:
//   16BIT
//
//  Alternative way to set the data size:
//   2 DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value (0 to 7 for this instruction)
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   16BIT                         sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0, 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 2
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after memory target
//                                 parameters and can not come in the middle
//                                 of memory target parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 PINSRW instruction. This opcode sequence copies a 16 bit integer from
//   the source to the destination.
//   The source can be a regular register or a memory target.
//   The destination can be a floating point or xmm register.
//   The index of the 16 bit integer copied is the immediate target's value.
//   Source register size or memory data size is ignored for this compiling word.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  RAX[R]  XMM1  PINSRW,  //  [RAX][15:0] -> XMM1[15:0]
//
//  0 N  RAX[R]  ST1   PINSRW,  //  [RAX][15:0] -> ST1[15:0]
//
//  1 N  RAX[R]  XMM2  PINSRW,  //  [RAX][15:0] -> XMM2[31:16]
//
//  3 N  CX  XMM5      PINSRW,  //  CX[15:0]    -> XMM5[63:48]
//
//  3 N  CX  ST5       PINSRW,  //  CX[15:0]    -> ST5[63:48]
//
//  7 N  DX  XMM4      PINSRW,  //  DX[15:0]    -> XMM4[127:112]
//
// Note:
//  Reverse not supported for this compiling word.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpinsrwcomma ( VPINSRW, )
//
// C prototype:
//  void dg_forthvpinsrwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( immediatetargetparameterlist rormtargetparameterlist xmmtargetparameterlist xmmtargetparameterlist -- )
//
// Data stack in:
//
//  immediatetargetparameterlist
//  rormtargetparameterlist
//  xmmtargetparameterlist
//  xmmtargetparameterlist
//
//
//  The parameter list for the immediatetargetparameterlist can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//
//  The parameter list for the xmmtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//
//  The parameter list for the rormtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to explicitly declare the data size for a memory target you can use this,
//   however it is not required:
//   16BIT
//
//  Alternative way to set the data size:
//   2 DATASIZE
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value (0 to 7 for this instruction)
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after the memory target
//                                 parameters and can not come in the middle
//                                 of the memory target parameters.
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0, 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of the instruction in bytes,
//                                 can be 2
//   DATASIZE                     sets the data size of the instruction
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls four targets from the data stack and compiles the opcode sequence for
//   an x86 VPINSRW instruction. This opcode sequence copies a 16 bit value from
//   a regular register or memory source, overwrites a 16 bit value in an xmm
//   register second source at the index from the immediate target's value and
//   then puts the result into another xmm register destination.
//   Source register size is ignored for this compiling word.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  RAX[R]  XMM2  XMM1   VPINSRW,  //  [RAX][15:0]  -> XMM1[15:0]
//                                      //  XMM2[127:16] -> XMM1[127:16]
//
//  1 N  RAX[R]  XMM1  XMM2  VPINSRW,   //  [RAX][15:0]  -> XMM2[31:16]
//                                      //  XMM1[15:0]   -> XMM2[15:0]
//                                      //  XMM1[127:32] -> XMM2[127:32]
//
//  3 N  CX  XMM3  XMM5  VPINSRW,       //  CX[15:0]     -> XMM5[63:48]
//                                      //  XMM3[47:0]   -> XMM5[47:0]
//                                      //  XMM3[127:64] -> XMM5[127:64]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpmaddwdcomma ( PMADDWD, )
//
// C prototype:
//  void dg_forthpmaddwdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   floatingpointregister FPSR
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   FPSR                         specifies a floating point stack register
//                                 FPSR is optional.
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMADDWD instruction. This opcode sequence multiplies each signed
//   16 bit integer from the destination with the corresponding signed 16 bit
//   integer from the source, then adds adjacent pairs of results
//   together to form signed 32 bit integer final results. The final results
//   are then put into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  PMADDWD,    // ( XMM1[15:0]    * [RBX][15:0]    ) +
//                             // ( XMM1[31:16]   * [RBX][31:16]   ) -> XMM1[31:0]
//                             // ( XMM1[47:32]   * [RBX][47:32]   ) +
//                             // ( XMM1[63:48]   * [RBX][63:48]   ) -> XMM1[63:32]
//                             // ( XMM1[79:64]   * [RBX][79:64]   ) +
//                             // ( XMM1[95:80]   * [RBX][95:80]   ) -> XMM1[95:64]
//                             // ( XMM1[111:96]  * [RBX][111:96]  ) +
//                             // ( XMM1[127:112] * [RBX][127:112] ) -> XMM1[127:96]
//
//  RBX [R]  ST1  PMADDWD,     // ( ST1[15:0]    * [RBX][15:0]    ) +
//                             // ( ST1[31:16]   * [RBX][31:16]   ) -> ST1[31:0]
//                             // ( ST1[47:32]   * [RBX][47:32]   ) +
//                             // ( ST1[63:48]   * [RBX][63:48]   ) -> ST1[63:32]
//
//  XMM2  XMM1  PMADDWD,       // ( XMM1[15:0]    * XMM2[15:0]    ) +
//                             // ( XMM1[31:16]   * XMM2[31:16]   ) -> XMM1[31:0]
//                             // ( XMM1[47:32]   * XMM2[47:32]   ) +
//                             // ( XMM1[63:48]   * XMM2[63:48]   ) -> XMM1[63:32]
//                             // ( XMM1[79:64]   * XMM2[79:64]   ) +
//                             // ( XMM1[95:80]   * XMM2[95:80]   ) -> XMM1[95:64]
//                             // ( XMM1[111:96]  * XMM2[111:96]  ) +
//                             // ( XMM1[127:112] * XMM2[127:112] ) -> XMM1[127:96]
//
//  ST2  ST1  PMADDWD,         // ( ST1[15:0]    * ST2[15:0]    ) +
//                             // ( ST1[31:16]   * ST2[31:16]   ) -> ST1[31:0]
//                             // ( ST1[47:32]   * ST2[47:32]   ) +
//                             // ( ST1[63:48]   * ST2[63:48]   ) -> ST1[63:32]
//
//  XMM1 <-  XMM2  PMADDWD, // ( XMM1[15:0]    * XMM2[15:0]    ) +
//                             // ( XMM1[31:16]   * XMM2[31:16]   ) -> XMM1[31:0]
//                             // ( XMM1[47:32]   * XMM2[47:32]   ) +
//                             // ( XMM1[63:48]   * XMM2[63:48]   ) -> XMM1[63:32]
//                             // ( XMM1[79:64]   * XMM2[79:64]   ) +
//                             // ( XMM1[95:80]   * XMM2[95:80]   ) -> XMM1[95:64]
//                             // ( XMM1[111:96]  * XMM2[111:96]  ) +
//                             // ( XMM1[127:112] * XMM2[127:112] ) -> XMM1[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or floating point register. If the source is not memory, then it
//   must be the same type of register as the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpmovmskbcomma ( PMOVMSKB, )
//
// C prototype:
//  void dg_forthpmovmskbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist rtargetparameterlist  -- )
//
// Data stack in:
//
//  targetxparameterlist
//  rtargetparameterlist
//
//  The parameter list for the targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   floatingpointregister
//   floatingpointregister FPSR
//   targetxmmregister
//   targetxmmregister XMMR
//
//  The parameter list for the rtargetparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   R                            specifies a register target.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PMOVMSKB instruction. This opcode sequence copies the high bit of
//   each byte of the source floating point or xmm register to the destination
//   register. The destination value is packed into one or two bytes depending
//   on whether the source is a floating point or xmm register.
//   The destination value is zero extended to the address mode size.
//   Destination register size is ignored for this compiling word unless
//   you use a 64 bit register with R. If you use a 64 bit register with R,
//   the REX.W prefix is compiled. The Intel docs don't say what it will do,
//   but on my icore-3 it's ignored. J.N. 3/30/2020
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//   XMM1  AX  PMOVMSKB,     //  XMM1[7]   -> AX[0]
//                           //  XMM1[15]  -> AX[1]
//                           //  XMM1[23]  -> AX[2]
//                           //  XMM1[31]  -> AX[3]
//                           //  XMM1[39]  -> AX[4]
//                           //  XMM1[47]  -> AX[5]
//                           //  XMM1[55]  -> AX[6]
//                           //  XMM1[63]  -> AX[7]
//                           //  XMM1[71]  -> AX[8]
//                           //  XMM1[79]  -> AX[9]
//                           //  XMM1[87]  -> AX[10]
//                           //  XMM1[95]  -> AX[11]
//                           //  XMM1[103] -> AX[12]
//                           //  XMM1[111] -> AX[13]
//                           //  XMM1[119] -> AX[14]
//                           //  XMM1[127] -> AX[15]
//                           //          0 -> RAX[63:16]
//
//   ST2  AL  PMOVMSKB,      //  ST2[7]   -> AL[0]
//                           //  ST2[15]  -> AL[1]
//                           //  ST2[23]  -> AL[2]
//                           //  ST2[31]  -> AL[3]
//                           //  ST2[39]  -> AL[4]
//                           //  ST2[47]  -> AL[5]
//                           //  ST2[55]  -> AL[6]
//                           //  ST2[63]  -> AL[7]
//                           //          0 -> RAX[63:8]
//
//  ST2  RAX  PMOVMSKB,      // same as above, (still no REX.W prefix.)
//  ST2  RAX R  PMOVMSKB,    // REX.W prefix gets compiled, but it's still
//                           //  the same.
// Note:
//  Reverse not supported for this compiling word.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpmovmskbcomma ( VPMOVMSKB, )
//
// C prototype:
//  void dg_forthvpmovmskbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist  -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for the targetxparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//   targetymmregister
//   targetymmregister YMMR
//
//  The parameter list for the targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetregister
//   targetregister R
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//
//   R                            specifies a register target.
//   XMMR                         specifies an XMM register target.
//   YMMR                         specifies a YMM register target.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VPMOVMSKB instruction. This opcode sequence copies the high bit of
//   each byte of the source xmm or ymm register to the destination
//   register. The destination value is packed into two or four bytes depending
//   on whether the source is an xmm or ymm register.
//   The destination value is zero extended to the address mode size.
//   Destination register size is ignored for this compiling word unless
//   you use a 64 bit register as the destination. If you use a 64 bit register,
//   the VEX.W is set but the Intel docs say it will be ignored.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//   XMM1  AX  VPMOVMSKB,    //  XMM1[7]   -> AX[0]
//                           //  XMM1[15]  -> AX[1]
//                           //  XMM1[23]  -> AX[2]
//                           //  XMM1[31]  -> AX[3]
//                           //  XMM1[39]  -> AX[4]
//                           //  XMM1[47]  -> AX[5]
//                           //  XMM1[55]  -> AX[6]
//                           //  XMM1[63]  -> AX[7]
//                           //  XMM1[71]  -> AX[8]
//                           //  XMM1[79]  -> AX[9]
//                           //  XMM1[87]  -> AX[10]
//                           //  XMM1[95]  -> AX[11]
//                           //  XMM1[103] -> AX[12]
//                           //  XMM1[111] -> AX[13]
//                           //  XMM1[119] -> AX[14]
//                           //  XMM1[127] -> AX[15]
//                           //          0 -> RAX[63:16]
//
//  HEX
//   YMM1  AX  VPMOVMSKB,    //  YMM1[7]   -> EAX[0]
//                           //  YMM1[15]  -> EAX[1]
//                           //  YMM1[23]  -> EAX[2]
//                           //  YMM1[31]  -> EAX[3]
//                           //  YMM1[39]  -> EAX[4]
//                           //  YMM1[47]  -> EAX[5]
//                           //  YMM1[55]  -> EAX[6]
//                           //  YMM1[63]  -> EAX[7]
//                           //  YMM1[71]  -> EAX[8]
//                           //  YMM1[79]  -> EAX[9]
//                           //  YMM1[87]  -> EAX[10]
//                           //  YMM1[95]  -> EAX[11]
//                           //  YMM1[103] -> EAX[12]
//                           //  YMM1[111] -> EAX[13]
//                           //  YMM1[119] -> EAX[14]
//                           //  YMM1[127] -> EAX[15]
//                           //  YMM1[135] -> EAX[16]
//                           //  YMM1[143] -> EAX[17]
//                           //  YMM1[151] -> EAX[18]
// ...
//                           //  YMM1[255] -> EAX[31]
//                           //          0 -> RAX[63:32]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpopcntcomma ( POPCNT, )
//
// C prototype:
//  void dg_forthpopcntcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  If you want to set the data size for memory targets, you can use these:
//   (This is optional for this instruction. The compiler can figure it out from
//   the size of the destination register.)
//   16BIT
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of a memory target in bytes,
//                                 can be  2, 4, or 8
//   DATASIZE                     sets the data size of a memory target
//                                 This is pushed after a memory target
//                                 parameters and can not come in the middle
//                                 of a memory target.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 POPCNT instruction. This opcode sequence counts the number of bits
//   that are set in the source and puts the count into the destination.
//   The destination must be a regular register. The source can be a regular
//   register or memory. The source and the destination must be the same size.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  RAX [R]  CX  POPCNT,   // Counttheonesin( [RAX][15:0] ) -> CX
//
//  RAX [R]  ECX  POPCNT,  // Counttheonesin( [RAX][31:0] ) -> ECX
//                         //  in 64 bit mode the upper 32 bits of RCX are cleared
//
//  RAX [R]  RCX  POPCNT,  // Counttheonesin( [RAX][63:0] ) -> RCX
//
//  AX  CX  POPCNT,        // Counttheonesin( AX[15:0] ) -> CX
//
//  EAX  ECX  POPCNT,      // Counttheonesin( EAX[31:0] ) -> ECX
//                         //  in 64 bit mode the upper 32 bits of RCX are cleared
//
//  RAX  RCX  POPCNT,      // Counttheonesin( RAX[63:0] ) -> RCX
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpslldcomma ( PSLLD, )
//
// C prototype:
//  void dg_forthpslldcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter lists for this compiling word can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//   floatingpointregister
//   floatingpointregister FPSR
//   targetxmmregister
//   targetxmmregister XMMR
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0, 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PSLLD instruction. This opcode sequence logical shifts each 32 bit
//   value in the destination xmm or floating point register left the number
//   of bits specified in the source value. If the source value is greater than
//   31, the destination is cleared. The source count can come from the same
//   type of register as the destination, an 8 bit immediate value, or memory.
//   If the source is memory, a 128 bit value is read from memory, but only the
//   lower 64 bits are used as the count.
//   (Zeros are shifted in from the right.)
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  XMM1  PSLLD,           //  nothing happens
//
//  1 N  XMM2  PSLLD,           // XMM2[31:0]   << 1 -> XMM2[31:0]
//                              // XMM2[63:32]  << 1 -> XMM2[63:32]
//                              // XMM2[95:64]  << 1 -> XMM2[95:64]
//                              // XMM2[127:96] << 1 -> XMM2[127:96]
//
//  1 N  ST2  PSLLD,            // ST2[31:0]   << 1 -> ST2[31:0]
//                              // ST2[63:32]  << 1 -> ST2[63:32]
//
//  2 N  XMM2  PSLLD,           // XMM2[31:0]   << 2 -> XMM2[31:0]
//                              // XMM2[63:32]  << 2 -> XMM2[63:32]
//                              // XMM2[95:64]  << 2 -> XMM2[95:64]
//                              // XMM2[127:96] << 2 -> XMM2[127:96]
//
//  2 N  ST2  PSLLD,            // ST2[31:0]   << 2 -> ST2[31:0]
//                              // ST2[63:32]  << 2 -> ST2[63:32]
//
//  32 N  XMM2  PSLLD,          // 0 -> XMM2[127:0]
//
//  XMM0  XMM2  PSLLD,          // XMM2[31:0]   << XMM0 -> XMM2[31:0]
//                              // XMM2[63:32]  << XMM0 -> XMM2[63:32]
//                              // XMM2[95:64]  << XMM0 -> XMM2[95:64]
//                              // XMM2[127:96] << XMM0 -> XMM2[127:96]
//
//  RAX [R]  XMM2  PSLLD,       // XMM2[31:0]   << [RAX][63:0]
//                              //  -> XMM2[31:0]
//                              // XMM2[63:32]  << [RAX][63:0]
//                              //  -> XMM2[63:32]
//                              // XMM2[95:64]  << [RAX][63:0]
//                              //  -> XMM2[95:64]
//                              // XMM2[127:96] << [RAX][63:0]
//                              //  -> XMM2[127:96]
//
// Note:
//  Explicitly setting the data size of a memory target is not supported in
//   this compiling word... I think if you tried and wanted it to work you
//   would have to set it to 128 bits for an XMM register destination and
//   to the address mode size for a floating point register destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpsllqcomma ( PSLLQ, )
//
// C prototype:
//  void dg_forthpsllqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter lists for this compiling word can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//   floatingpointregister
//   floatingpointregister FPSR
//   targetxmmregister
//   targetxmmregister XMMR
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0, 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PSLLQ instruction. This opcode sequence logical shifts each 64 bit
//   value in the destination xmm register or floating point register left the
//   number of bits specified in the source value. If the source value is
//   greater than 63, the destination is cleared. The source count can come
//   from the same type of register as the destination, an 8 bit immediate
//   value, or memory. If the source is memory, a 128 bit value is read from
//   memory, but only the lower 64 bits are used as the count.
//   (Zeros are shifted in from the right.)
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  XMM1  PSLLQ,           //  nothing happens
//
//  1 N  XMM2  PSLLQ,           // XMM2[63:0]    << 1 -> XMM2[63:0]
//                              // XMM2[127:64]  << 1 -> XMM2[127:64]
//
//  1 N  ST2  PSLLQ,            // ST2[63:0]     << 1 -> ST2[63:0]
//
//  2 N  XMM2  PSLLQ,           // XMM2[63:0]    << 2 -> XMM2[63:0]
//                              // XMM2[127:64]  << 2 -> XMM2[127:64]
//
//  2 N  ST2  PSLLQ,            // ST2[63:0]     << 2 -> ST2[63:0]
//
//  64 N  XMM2  PSLLQ,          // 0 -> XMM2[127:0]
//
//  XMM0  XMM2  PSLLQ,          // XMM2[63:0]   << XMM0 -> XMM2[63:0]
//                              // XMM2[127:64] << XMM0 -> XMM2[127:64]
//
//  RAX [R]  XMM2  PSLLQ,       // XMM2[63:0]   << [RAX][63:0]
//                              //  -> XMM2[63:0]
//                              // XMM2[127:64] << [RAX][63:0]
//                              //  -> XMM2[127:64]
//
// Note:
//  Explicitly setting the data size of a memory target is not supported in
//   this compiling word... I think if you tried and wanted it to work you
//   would have to set it to 128 bits for an XMM register destination and
//   to the address mode size for a floating point register destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpsllwcomma ( PSLLW, )
//
// C prototype:
//  void dg_forthpsllwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter lists for this compiling word can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//   floatingpointregister
//   floatingpointregister FPSR
//   targetxmmregister
//   targetxmmregister XMMR
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0 or 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PSLLW instruction. This opcode sequence logically shifts each 16 bit
//   value in the destination xmm register or floating point register left the
//   number of bits specified in the source value. (Zeros are
//   shifted in from the right). If the source value is greater than 15, the
//   destination is cleared. The source count can come from the same type of
//   register as the destination, an 8 bit immediate value,
//   or memory. If the source is memory, a 128 bit value is read from memory,
//   but only the lower 64 bits are used as the count.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  XMM1  PSLLW,           //  nothing happens
//
//  1 N  XMM2  PSLLW,           // XMM2[15:0]    << 1 -> XMM2[15:0]
//                              // XMM2[31:16]   << 1 -> XMM2[31:16]
//                              // XMM2[47:32]   << 1 -> XMM2[47:32]
//                              // XMM2[63:48]   << 1 -> XMM2[63:48]
//                              // XMM2[79:64]   << 1 -> XMM2[79:64]
//                              // XMM2[95:80]   << 1 -> XMM2[95:80]
//                              // XMM2[111:96]  << 1 -> XMM2[111:96]
//                              // XMM2[127:112] << 1 -> XMM2[127:112]
//
//  1 N  ST2  PSLLW,            // ST2[15:0]    << 1 -> ST2[15:0]
//                              // ST2[31:16]   << 1 -> ST2[31:16]
//                              // ST2[47:32]   << 1 -> ST2[47:32]
//                              // ST2[63:48]   << 1 -> ST2[63:48]
//
//  2 N  XMM2  PSLLW,           // XMM2[15:0]    << 2 -> XMM2[15:0]
//                              // XMM2[31:16]   << 2 -> XMM2[31:16]
//                              // XMM2[47:32]   << 2 -> XMM2[47:32]
//                              // XMM2[63:48]   << 2 -> XMM2[63:48]
//                              // XMM2[79:64]   << 2 -> XMM2[79:64]
//                              // XMM2[95:80]   << 2 -> XMM2[95:80]
//                              // XMM2[111:96]  << 2 -> XMM2[111:96]
//                              // XMM2[127:112] << 2 -> XMM2[127:112]
//
//  2 N  ST2  PSLLW,            // ST2[15:0]    << 2 -> ST2[15:0]
//                              // ST2[31:16]   << 2 -> ST2[31:16]
//                              // ST2[47:32]   << 2 -> ST2[47:32]
//                              // ST2[63:48]   << 2 -> ST2[63:48]
//
//  16 N  XMM2  PSLLW,          // 0 -> XMM2[127:0]
//
//  XMM0  XMM2  PSLLW,          // XMM2[15:0]    << XMM0
//                              //  -> XMM2[15:0]
//                              // XMM2[31:16]   << XMM0
//                              //  -> XMM2[31:16]
//                              // XMM2[47:32]   << XMM0
//                              //  -> XMM2[47:32]
//                              // XMM2[63:48]   << XMM0
//                              //  -> XMM2[63:48]
//                              // XMM2[79:64]   << XMM0
//                              //  -> XMM2[79:64]
//                              // XMM2[95:80]   << XMM0
//                              //  -> XMM2[95:80]
//                              // XMM2[111:96]  << XMM0
//                              //  -> XMM2[111:96]
//                              // XMM2[127:112] << XMM0
//                              //  -> XMM2[127:112]
//
// Note:
//  Explicitly setting the data size of a memory target is not supported in
//   this compiling word... I think if you tried and wanted it to work you
//   would have to set it to 128 bits for an XMM register destination and
//   to the address mode size for a floating point register destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpsradcomma ( PSRAD, )
//
// C prototype:
//  void dg_forthpsradcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter lists for this compiling word can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//   floatingpointregister
//   floatingpointregister FPSR
//   targetxmmregister
//   targetxmmregister XMMR
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0 or 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PSRAD instruction. This opcode sequence arithmetically shifts each
//   signed 32 bit value in the destination xmm or floating point register
//   right the number of bits specified in the source value. The source count
//   can come from the same type of register as the destination, an 8 bit
//   immediate value, or memory. If the source is memory, a 128 bit value is
//   read from memory, but only the lower 64 bits are used as the count.
//   The sign bit (highest bit of each 32 bit value) is copied and shifted
//   in from the left.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  XMM1  PSRAD,           //  nothing happens
//
//  1 N  XMM2  PSRAD,           // XMM2[31:0]   >> 1 -> XMM2[31:0]
//                              // XMM2[63:32]  >> 1 -> XMM2[63:32]
//                              // XMM2[95:64]  >> 1 -> XMM2[95:64]
//                              // XMM2[127:96] >> 1 -> XMM2[127:96]
//
//  1 N  ST2  PSRAD,            // ST2[31:0]   >> 1 -> ST2[31:0]
//                              // ST2[63:32]  >> 1 -> ST2[63:32]
//
//  2 N  XMM2  PSRAD,           // XMM2[31:0]   >> 2 -> XMM2[31:0]
//                              // XMM2[63:32]  >> 2 -> XMM2[63:32]
//                              // XMM2[95:64]  >> 2 -> XMM2[95:64]
//                              // XMM2[127:96] >> 2 -> XMM2[127:96]
//
//  2 N  ST2  PSRAD,            // ST2[31:0]   >> 2 -> ST2[31:0]
//                              // ST2[63:32]  >> 2 -> ST2[63:32]
//
//  XMM0  XMM2  PSRAD,          // XMM2[31:0]   >> XMM0 -> XMM2[31:0]
//                              // XMM2[63:32]  >> XMM0 -> XMM2[63:32]
//                              // XMM2[95:64]  >> XMM0 -> XMM2[95:64]
//                              // XMM2[127:96] >> XMM0 -> XMM2[127:96]
//
//  RAX [R]  XMM2  PSRAD,       // XMM2[31:0]   >> [RAX][63:0]
//                              //  -> XMM2[31:0]
//                              // XMM2[63:32]  >> [RAX][63:0]
//                              //  -> XMM2[63:32]
//                              // XMM2[95:64]  >> [RAX][63:0]
//                              //  -> XMM2[95:64]
//                              // XMM2[127:96] >> [RAX][63:0]
//                              //  -> XMM2[127:96]
//
// Note:
//  Explicitly setting the data size of a memory target is not supported in
//   this compiling word.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpsrawcomma ( PSRAW, )
//
// C prototype:
//  void dg_forthpsrawcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter lists for this compiling word can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//   floatingpointregister
//   floatingpointregister FPSR
//   targetxmmregister
//   targetxmmregister XMMR
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0 or 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PSRAW instruction. This opcode sequence arithmetically shifts each
//   signed 16 bit value in the destination xmm register or floating point
//   register right the number of bits specified in the source value.
//   The source count can come from the same type of
//   register as the destination, an 8 bit immediate value,
//   or memory. If the source is memory, a 128 bit value is read from memory,
//   but only the lower 64 bits are used as the count.
//   The sign bit (highest bit of each 16 bit value) is copied and shifted
//   in from the left.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  XMM1  PSRAW,           //  nothing happens
//
//  1 N  XMM2  PSRAW,           // XMM2[15:0]    >> 1 -> XMM2[15:0]
//                              // XMM2[31:16]   >> 1 -> XMM2[31:16]
//                              // XMM2[47:32]   >> 1 -> XMM2[47:32]
//                              // XMM2[63:48]   >> 1 -> XMM2[63:48]
//                              // XMM2[79:64]   >> 1 -> XMM2[79:64]
//                              // XMM2[95:80]   >> 1 -> XMM2[95:80]
//                              // XMM2[111:96]  >> 1 -> XMM2[111:96]
//                              // XMM2[127:112] >> 1 -> XMM2[127:112]
//
//  1 N  ST2  PSRAW,            // ST2[15:0]    >> 1 -> ST2[15:0]
//                              // ST2[31:16]   >> 1 -> ST2[31:16]
//                              // ST2[47:32]   >> 1 -> ST2[47:32]
//                              // ST2[63:48]   >> 1 -> ST2[63:48]
//
//  2 N  XMM2  PSRAW,           // XMM2[15:0]    >> 2 -> XMM2[15:0]
//                              // XMM2[31:16]   >> 2 -> XMM2[31:16]
//                              // XMM2[47:32]   >> 2 -> XMM2[47:32]
//                              // XMM2[63:48]   >> 2 -> XMM2[63:48]
//                              // XMM2[79:64]   >> 2 -> XMM2[79:64]
//                              // XMM2[95:80]   >> 2 -> XMM2[95:80]
//                              // XMM2[111:96]  >> 2 -> XMM2[111:96]
//                              // XMM2[127:112] >> 2 -> XMM2[127:112]
//
//  2 N  ST2  PSRAW,            // ST2[15:0]    >> 2 -> ST2[15:0]
//                              // ST2[31:16]   >> 2 -> ST2[31:16]
//                              // ST2[47:32]   >> 2 -> ST2[47:32]
//                              // ST2[63:48]   >> 2 -> ST2[63:48]
//
//  XMM0  XMM2  PSRAW,          // XMM2[15:0]    >> XMM0
//                              //  -> XMM2[15:0]
//                              // XMM2[31:16]   >> XMM0
//                              //  -> XMM2[31:16]
//                              // XMM2[47:32]   >> XMM0
//                              //  -> XMM2[47:32]
//                              // XMM2[63:48]   >> XMM0
//                              //  -> XMM2[63:48]
//                              // XMM2[79:64]   >> XMM0
//                              //  -> XMM2[79:64]
//                              // XMM2[95:80]   >> XMM0
//                              //  -> XMM2[95:80]
//                              // XMM2[111:96]  >> XMM0
//                              //  -> XMM2[111:96]
//                              // XMM2[127:112] >> XMM0
//                              //  -> XMM2[127:112]
//
// Note:
//  Explicitly setting the data size of a memory target is not supported in
//   this compiling word.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpsrlwcomma ( PSRLW, )
//
// C prototype:
//  void dg_forthpsrlwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter lists for this compiling word can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//   floatingpointregister
//   floatingpointregister FPSR
//   targetxmmregister
//   targetxmmregister XMMR
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0 or 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PSRLW instruction. This opcode sequence logically shifts each
//   unsigned 16 bit value in the destination xmm register or floating point
//   register right the number of bits specified in the source value.
//   The source count can come from the same type of
//   register as the destination, an 8 bit immediate value,
//   or memory. If the source is memory, a 128 bit value is read from memory,
//   but only the lower 64 bits are used as the count.
//   Zeros are shifted in from the left.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  XMM1  PSRLW,           //  nothing happens
//
//  1 N  XMM2  PSRLW,           // XMM2[15:0]    >> 1 -> XMM2[15:0]
//                              // XMM2[31:16]   >> 1 -> XMM2[31:16]
//                              // XMM2[47:32]   >> 1 -> XMM2[47:32]
//                              // XMM2[63:48]   >> 1 -> XMM2[63:48]
//                              // XMM2[79:64]   >> 1 -> XMM2[79:64]
//                              // XMM2[95:80]   >> 1 -> XMM2[95:80]
//                              // XMM2[111:96]  >> 1 -> XMM2[111:96]
//                              // XMM2[127:112] >> 1 -> XMM2[127:112]
//
//  1 N  ST2  PSRLW,            // ST2[15:0]    >> 1 -> ST2[15:0]
//                              // ST2[31:16]   >> 1 -> ST2[31:16]
//                              // ST2[47:32]   >> 1 -> ST2[47:32]
//                              // ST2[63:48]   >> 1 -> ST2[63:48]
//
//  2 N  XMM2  PSRLW,           // XMM2[15:0]    >> 2 -> XMM2[15:0]
//                              // XMM2[31:16]   >> 2 -> XMM2[31:16]
//                              // XMM2[47:32]   >> 2 -> XMM2[47:32]
//                              // XMM2[63:48]   >> 2 -> XMM2[63:48]
//                              // XMM2[79:64]   >> 2 -> XMM2[79:64]
//                              // XMM2[95:80]   >> 2 -> XMM2[95:80]
//                              // XMM2[111:96]  >> 2 -> XMM2[111:96]
//                              // XMM2[127:112] >> 2 -> XMM2[127:112]
//
//  2 N  ST2  PSRLW,            // ST2[15:0]    >> 2 -> ST2[15:0]
//                              // ST2[31:16]   >> 2 -> ST2[31:16]
//                              // ST2[47:32]   >> 2 -> ST2[47:32]
//                              // ST2[63:48]   >> 2 -> ST2[63:48]
//
//  XMM0  XMM2  PSRLW,          // XMM2[15:0]    >> XMM0
//                              //  -> XMM2[15:0]
//                              // XMM2[31:16]   >> XMM0
//                              //  -> XMM2[31:16]
//                              // XMM2[47:32]   >> XMM0
//                              //  -> XMM2[47:32]
//                              // XMM2[63:48]   >> XMM0
//                              //  -> XMM2[63:48]
//                              // XMM2[79:64]   >> XMM0
//                              //  -> XMM2[79:64]
//                              // XMM2[95:80]   >> XMM0
//                              //  -> XMM2[95:80]
//                              // XMM2[111:96]  >> XMM0
//                              //  -> XMM2[111:96]
//                              // XMM2[127:112] >> XMM0
//                              //  -> XMM2[127:112]
//
// Note:
//  Explicitly setting the data size of a memory target is not supported in
//   this compiling word.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpsrldcomma ( PSRLD, )
//
// C prototype:
//  void dg_forthpsrldcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter lists for this compiling word can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//   floatingpointregister
//   floatingpointregister FPSR
//   targetxmmregister
//   targetxmmregister XMMR
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0, 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PSRLD instruction. This opcode sequence logically shifts each
//   unsigned 32 bit value in the destination xmm or floating point register
//   right the number of bits specified in the source value. The source count
//   can come from the same type of register as the destination, an 8 bit
//   immediate value, or memory. If the source is memory, a 128 bit value is
//   read from memory, but only the lower 64 bits are used as the count.
//   Zeros are shifted in from the left.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  XMM1  PSRLD,           //  nothing happens
//
//  1 N  XMM2  PSRLD,           // XMM2[31:0]   >> 1 -> XMM2[31:0]
//                              // XMM2[63:32]  >> 1 -> XMM2[63:32]
//                              // XMM2[95:64]  >> 1 -> XMM2[95:64]
//                              // XMM2[127:96] >> 1 -> XMM2[127:96]
//
//  1 N  ST2  PSRLD,            // ST2[31:0]   >> 1 -> ST2[31:0]
//                              // ST2[63:32]  >> 1 -> ST2[63:32]
//
//  2 N  XMM2  PSRLD,           // XMM2[31:0]   >> 2 -> XMM2[31:0]
//                              // XMM2[63:32]  >> 2 -> XMM2[63:32]
//                              // XMM2[95:64]  >> 2 -> XMM2[95:64]
//                              // XMM2[127:96] >> 2 -> XMM2[127:96]
//
//  2 N  ST2  PSRLD,            // ST2[31:0]   >> 2 -> ST2[31:0]
//                              // ST2[63:32]  >> 2 -> ST2[63:32]
//
//  XMM0  XMM2  PSRLD,          // XMM2[31:0]   >> XMM0 -> XMM2[31:0]
//                              // XMM2[63:32]  >> XMM0 -> XMM2[63:32]
//                              // XMM2[95:64]  >> XMM0 -> XMM2[95:64]
//                              // XMM2[127:96] >> XMM0 -> XMM2[127:96]
//
//  RAX [R]  XMM2  PSRLD,       // XMM2[31:0]   >> [RAX][63:0]
//                              //  -> XMM2[31:0]
//                              // XMM2[63:32]  >> [RAX][63:0]
//                              //  -> XMM2[63:32]
//                              // XMM2[95:64]  >> [RAX][63:0]
//                              //  -> XMM2[95:64]
//                              // XMM2[127:96] >> [RAX][63:0]
//                              //  -> XMM2[127:96]
//
// Note:
//  Explicitly setting the data size of a memory target is not supported in
//   this compiling word.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpsrldqcomma ( PSRLDQ, )
//
// C prototype:
//  void dg_forthpsrldqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter lists for this compiling word can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//   targetxmmregister
//   targetxmmregister XMMR
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0, 1
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PSRLDQ instruction. This opcode sequence logically shifts the
//   unsigned 128 bit value in the destination xmm register
//   right the number of BYTES specified in the source value. The source must
//   be an 8 bit immediate value. Yes, this is a byte shift, not a bit shift.
//   Zeros are shifted in from the left.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  XMM1  PSRLDQ,          //  nothing happens
//
//  1 N  XMM2  PSRLDQ,           // XMM2[127:0]  >> 8 -> XMM2[127:0]
//
//  2 N  XMM2  PSRLDQ,           // XMM2[127:0] >> 16 -> XMM2[127:0]
//
// Note:
//  Direction is ignored. The compiler can figure out the direction from
//   the targets. The xmm register is always the destination.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpsrlqcomma ( PSRLQ, )
//
// C prototype:
//  void dg_forthpsrlqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter lists for this compiling word can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//   floatingpointregister
//   floatingpointregister FPSR
//   targetxmmregister
//   targetxmmregister XMMR
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value
//   floatingpointregister        one of:
//                                 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0, 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 PSRLQ instruction. This opcode sequence logically shifts each
//   unsigned 64 bit value in the destination xmm or floating point register
//   right the number of bits specified in the source value. The source count
//   can come from the same type of register as the destination, an 8 bit
//   immediate value, or memory. If the source is memory, a 128 bit value is
//   read from memory, but only the lower 64 bits are used as the count.
//   Zeros are shifted in from the left.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  XMM1  PSRLQ,           //  nothing happens
//
//  1 N  XMM2  PSRLQ,           // XMM2[63:0]   >> 1 -> XMM2[63:0]
//                              // XMM2[127:64] >> 1 -> XMM2[127:64]
//
//  1 N  ST2  PSRLQ,            // ST2[63:0]    >> 1 -> ST2[63:0]
//
//  2 N  XMM2  PSRLQ,           // XMM2[63:0]   >> 2 -> XMM2[63:0]
//                              // XMM2[127:64] >> 2 -> XMM2[127:64]
//
//  2 N  ST2  PSRLQ,            // ST2[63:0]    >> 2 -> ST2[63:0]
//
//  XMM0  XMM2  PSRLQ,          // XMM2[63:0]   >> XMM0 -> XMM2[63:0]
//                              // XMM2[127:64] >> XMM0 -> XMM2[127:64]
//
//  RAX [R]  XMM2  PSRLQ,       // XMM2[63:0]   >> [RAX][63:0]
//                              //  -> XMM2[63:0]
//                              // XMM2[127:64] >> [RAX][63:0]
//                              //  -> XMM2[127:64]
//
// Note:
//  Explicitly setting the data size of a memory target is not supported in
//   this compiling word.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmadd132pdcomma ( VFMADD132PD, )
//
// C prototype:
//  void dg_forthvfmadd132pdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMADD132PD instruction. This opcode sequence multiplies each double
//   precision floating point value from the destination with the corresponding
//   one from the source, then adds the result to the corresponding value in
//   target y, and then puts the results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMADD132PD,     // ([RBX][63:0]   * XMM0[63:0]) +
//                                      //  XMM1[63:0]    -> XMM0[63:0]
//                                      // ([RBX][127:64] * XMM0[127:64]) +
//                                      //  XMM1[127:64]  -> XMM0[127:64]
//
//  XMM2 XMM1 XMM0  VFMADD132PD,        // (XMM2[63:0]    * XMM0[63:0]) +
//                                      //  XMM1[63:0]   -> XMM0[63:0]
//                                      // (XMM2[127:64]  * XMM0[127:64]) +
//                                      //  XMM1[127:64] -> XMM0[127:64]
//
//  XMM2 <- XMM1 XMM0  VFMADD132PD,  // (XMM0[63:0]   * XMM2[63:0]) +
//                                      //  XMM1[63:0]   -> XMM0[63:0]
//                                      // (XMM0[127:64] * XMM2[127:64]) +
//                                      //  XMM1[127:64] -> XMM0[127:64]
//
//  YMM0 YMM1 YMM8 VFMADD132PD,         // (YMM0[63:0]    * YMM8[63:0]) +
//                                      //  YMM1[63:0]    -> YMM8[63:0]
//                                      // (YMM0[127:64]  * YMM8[127:64]) +
//                                      //  YMM1[127:64]  -> YMM8[127:64]
//                                      // (YMM0[191:128] * YMM8[191:128]) +
//                                      //  YMM1[191:128] -> YMM8[191:128]
//                                      // (YMM0[255:192] * YMM8[255:192]) +
//                                      //  YMM1[255:192] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmadd213pdcomma ( VFMADD213PD, )
//
// C prototype:
//  void dg_forthvfmadd213pdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMADD213PD instruction. This opcode sequence multiplies each double
//   precision floating point value from the destination with the corresponding
//   one from target y, then adds the result to the corresponding value in
//   the source, and then puts the results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMADD213PD,     // (XMM1[63:0]    * XMM0[63:0]) +
//                                      //  [RBX][63:0]   -> XMM0[63:0]
//                                      // (XMM1[127:64]  * XMM0[127:64]) +
//                                      //  [RBX][127:64] -> XMM0[127:64]
//
//  XMM2 XMM1 XMM0  VFMADD213PD,        // (XMM1[63:0]    * XMM0[63:0]) +
//                                      //  XMM2[63:0]   -> XMM0[63:0]
//                                      // (XMM1[127:64]  * XMM0[127:64]) +
//                                      //  XMM2[127:64] -> XMM0[127:64]
//
//  XMM2 <- XMM1 XMM0  VFMADD213PD,  // (XMM1[63:0]   * XMM2[63:0]) +
//                                      //  XMM0[63:0]   -> XMM0[63:0]
//                                      // (XMM1[127:64] * XMM2[127:64]) +
//                                      //  XMM0[127:64] -> XMM0[127:64]
//
//  YMM0 YMM1 YMM8 VFMADD213PD,         // (YMM1[63:0]    * YMM8[63:0]) +
//                                      //  YMM0[63:0]    -> YMM8[63:0]
//                                      // (YMM1[127:64]  * YMM8[127:64]) +
//                                      //  YMM0[127:64]  -> YMM8[127:64]
//                                      // (YMM1[191:128] * YMM8[191:128]) +
//                                      //  YMM0[191:128] -> YMM8[191:128]
//                                      // (YMM1[255:192] * YMM8[255:192]) +
//                                      //  YMM0[255:192] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmadd231pdcomma ( VFMADD231PD, )
//
// C prototype:
//  void dg_forthvfmadd231pdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMADD231PD instruction. This opcode sequence multiplies each double
//   precision floating point value from the source with the corresponding
//   one from target y, then adds the result to the corresponding value in
//   the destination, and then puts the results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMADD231PD,     // (XMM1[63:0]   * [RBX][63:0]) +
//                                      //  XMM0[63:0]   -> XMM0[63:0]
//                                      // (XMM1[127:64] * [RBX][127:64]) +
//                                      //  XMM0[127:64] -> XMM0[127:64]
//
//  XMM2 XMM1 XMM0  VFMADD231PD,        // (XMM1[63:0]    * XMM2[63:0]) +
//                                      //  XMM0[63:0]   -> XMM0[63:0]
//                                      // (XMM1[127:64]  * XMM2[127:64]) +
//                                      //  XMM0[127:64] -> XMM0[127:64]
//
//  XMM2 <- XMM1 XMM0  VFMADD231PD,  // (XMM1[63:0]   * XMM0[63:0]) +
//                                      //  XMM2[63:0]   -> XMM0[63:0]
//                                      // (XMM1[127:64] * XMM0[127:64]) +
//                                      //  XMM2[127:64] -> XMM0[127:64]
//
//  YMM0 YMM1 YMM8 VFMADD231PD,         // (YMM1[63:0]   * YMM0[63:0]) +
//                                      //  YMM8[63:0]   -> YMM8[63:0]
//                                      // (YMM1[127:64] * YMM0[127:64]) +
//                                      //  YMM8[127:64] -> YMM8[127:64]
//                                      // (YMM1[191:128] * YMM0[191:128]) +
//                                      //  YMM8[191:128] -> YMM8[191:128]
//                                      // (YMM1[255:192] * YMM0[255:192]) +
//                                      //  YMM8[255:192] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmadd132pscomma ( VFMADD132PS, )
//
// C prototype:
//  void dg_forthvfmadd132pscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMADD132PS instruction. This opcode sequence multiplies each single
//   precision floating point value from the destination with the corresponding
//   one from the source, then adds the result to the corresponding value in
//   target y, and then puts the results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMADD132PS,     // ([RBX][31:0] * XMM0[31:0]) +
//                                      //  XMM1[31:0] -> XMM0[31:0]
//                                      // ([RBX][63:32] * XMM0[63:32]) +
//                                      //  XMM1[63:32] -> XMM0[63:32]
//                                      // ([RBX][95:64] * XMM0[95:64]) +
//                                      //  XMM1[95:64] -> XMM0[95:64]
//                                      // ([RBX][127:96] * XMM0[127:96]) +
//                                      //  XMM1[127:96] -> XMM0[127:96]
//
//  XMM2 XMM1 XMM0  VFMADD132PS,        // (XMM2[31:0] * XMM0[31:0]) +
//                                      //  XMM1[31:0] -> XMM0[31:0]
//                                      // (XMM2[63:32] * XMM0[63:32]) +
//                                      //  XMM1[63:32] -> XMM0[63:32]
//                                      // (XMM2[95:64] * XMM0[95:64]) +
//                                      //  XMM1[95:64] -> XMM0[95:64]
//                                      // (XMM2[127:96] * XMM0[127:96]) +
//                                      //  XMM1[127:96] -> XMM0[127:96]
//
//  XMM2 &lt;- XMM1 XMM0  VFMADD132PS,  // (XMM0[31:0] * XMM2[31:0]) +
//                                      //  XMM1[31:0] -> XMM0[31:0]
//                                      // (XMM0[63:32] * XMM2[63:32]) +
//                                      //  XMM1[63:32] -> XMM0[63:32]
//                                      // (XMM0[95:64] * XMM2[95:64]) +
//                                      //  XMM1[95:64] -> XMM0[95:64]
//                                      // (XMM0[127:96] * XMM2[127:96]) +
//                                      //  XMM1[127:96] -> XMM0[127:96]
//
//  YMM0 YMM1 YMM8 VFMADD132PS,         // (YMM0[31:0] * YMM8[31:0]) +
//                                      //  YMM1[31:0] -> YMM8[31:0]
//                                      // (YMM0[63:32] * YMM8[63:32]) +
//                                      //  YMM1[63:32] -> YMM8[63:32]
//                                      // (YMM0[95:64] * YMM8[95:64]) +
//                                      //  YMM1[95:64] -> YMM8[95:64]
//                                      // (YMM0[127:96] * YMM8[127:96]) +
//                                      //  YMM1[127:96] -> YMM8[127:96]
//                                      // (YMM0[159:128] * YMM8[159:128]) +
//                                      //  YMM1[159:128] -> YMM8[159:128]
//                                      // (YMM0[191:160] * YMM8[191:160]) +
//                                      //  YMM1[191:160] -> YMM8[191:160]
//                                      // (YMM0[223:192] * YMM8[223:192]) +
//                                      //  YMM1[223:192] -> YMM8[223:192]
//                                      // (YMM0[255:224] * YMM8[255:224]) +
//                                      //  YMM1[255:224] -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmadd213pscomma ( VFMADD213PS, )
//
// C prototype:
//  void dg_forthvfmadd213pscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMADD213PS instruction. This opcode sequence multiplies each single
//   precision floating point value from the destination with the corresponding
//   one from target y, then adds the result to the corresponding value in
//   the source, and then puts the results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMADD213PS,     // (XMM1[31:0] * XMM0[31:0]) +
//                                      //  [RBX][31:0] -> XMM0[31:0]
//                                      // (XMM1[63:32] * XMM0[63:32]) +
//                                      //  [RBX][63:32] -> XMM0[63:32]
//                                      // (XMM1[95:64] * XMM0[95:64]) +
//                                      //  [RBX][95:64] -> XMM0[95:64]
//                                      // (XMM1[127:96] * XMM0[127:96]) +
//                                      //  [RBX][127:96] -> XMM0[127:96]
//
//  XMM2 XMM1 XMM0  VFMADD213PS,        // (XMM1[31:0] * XMM0[31:0]) +
//                                      //  XMM2[31:0] -> XMM0[31:0]
//                                      // (XMM1[63:32] * XMM0[63:32]) +
//                                      //  XMM2[63:32] -> XMM0[63:32]
//                                      // (XMM1[95:64] * XMM0[95:64]) +
//                                      //  XMM2[95:64] -> XMM0[95:64]
//                                      // (XMM1[127:96] * XMM0[127:96]) +
//                                      //  XMM2[127:96] -> XMM0[127:96]
//
//  XMM2 <- XMM1 XMM0  VFMADD213PS,  // (XMM1[31:0] * XMM2[31:0]) +
//                                      //  XMM0[31:0] -> XMM0[31:0]
//                                      // (XMM1[63:32] * XMM2[63:32]) +
//                                      //  XMM0[63:32] -> XMM0[63:32]
//                                      // (XMM1[95:64] * XMM2[95:64]) +
//                                      //  XMM0[95:64] -> XMM0[95:64]
//                                      // (XMM1[127:96] * XMM2[127:96]) +
//                                      //  XMM0[127:96] -> XMM0[127:96]
//
//  YMM0 YMM1 YMM8 VFMADD213PS,         // (YMM1[31:0] * YMM8[31:0]) +
//                                      //  YMM0[31:0] -> YMM8[31:0]
//                                      // (YMM1[63:32] * YMM8[63:32]) +
//                                      //  YMM0[63:32] -> YMM8[63:32]
//                                      // (YMM1[95:64] * YMM8[95:64]) +
//                                      //  YMM0[95:64] -> YMM8[95:64]
//                                      // (YMM1[127:96] * YMM8[127:96]) +
//                                      //  YMM0[127:96] -> YMM8[127:96]
//                                      // (YMM1[159:128] * YMM8[159:128]) +
//                                      //  YMM0[159:128] -> YMM8[159:128]
//                                      // (YMM1[191:160] * YMM8[191:160]) +
//                                      //  YMM0[191:160] -> YMM8[191:160]
//                                      // (YMM1[223:192] * YMM8[223:192]) +
//                                      //  YMM0[223:192] -> YMM8[223:192]
//                                      // (YMM1[255:224] * YMM8[255:224]) +
//                                      //  YMM0[255:224] -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmadd231pscomma ( VFMADD231PS, )
//
// C prototype:
//  void dg_forthvfmadd231pscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMADD231PS instruction. This opcode sequence multiplies each single
//   precision floating point value from the source with the corresponding
//   one from target y, then adds the result to the corresponding value in
//   the destination, and then puts the results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMADD231PS,     // (XMM1[31:0] * [RBX][31:0]) +
//                                      //  XMM0[31:0] -> XMM0[31:0]
//                                      // (XMM1[63:32] * [RBX][63:32]) +
//                                      //  XMM0[63:32] -> XMM0[63:32]
//                                      // (XMM1[95:64] * [RBX][95:64]) +
//                                      //  XMM0[95:64] -> XMM0[95:64]
//                                      // (XMM1[127:96] * [RBX][127:96]) +
//                                      //  XMM0[127:96] -> XMM0[127:96]
//
//  XMM2 XMM1 XMM0  VFMADD231PS,        // (XMM1[31:0] * XMM2[31:0]) +
//                                      //  XMM0[31:0] -> XMM0[31:0]
//                                      // (XMM1[63:32] * XMM2[63:32]) +
//                                      //  XMM0[63:32] -> XMM0[63:32]
//                                      // (XMM1[95:64] * XMM2[95:64]) +
//                                      //  XMM0[95:64] -> XMM0[95:64]
//                                      // (XMM1[127:96] * XMM2[127:96]) +
//                                      //  XMM0[127:96] -> XMM0[127:96]
//
//  XMM2 <- XMM1 XMM0  VFMADD231PS,  // (XMM1[31:0] * XMM0[31:0]) +
//                                      //  XMM2[31:0] -> XMM0[31:0]
//                                      // (XMM1[63:32] * XMM0[63:32]) +
//                                      //  XMM2[63:32] -> XMM0[63:32]
//                                      // (XMM1[95:64] * XMM0[95:64]) +
//                                      //  XMM2[95:64] -> XMM0[95:64]
//                                      // (XMM1[127:96] * XMM0[127:96]) +
//                                      //  XMM2[127:96] -> XMM0[127:96]
//
//  YMM0 YMM1 YMM8 VFMADD231PS,         // (YMM1[31:0] * YMM0[31:0]) +
//                                      //  YMM8[31:0] -> YMM8[31:0]
//                                      // (YMM1[63:32] * YMM0[63:32]) +
//                                      //  YMM8[63:32] -> YMM8[63:32]
//                                      // (YMM1[95:64] * YMM0[95:64]) +
//                                      //  YMM8[95:64] -> YMM8[95:64]
//                                      // (YMM1[127:96] * YMM0[127:96]) +
//                                      //  YMM8[127:96] -> YMM8[127:96]
//                                      // (YMM1[159:128] * YMM0[159:128]) +
//                                      //  YMM8[159:128] -> YMM8[159:128]
//                                      // (YMM1[191:160] * YMM0[191:160]) +
//                                      //  YMM8[191:160] -> YMM8[191:160]
//                                      // (YMM1[223:192] * YMM0[223:192]) +
//                                      //  YMM8[223:192] -> YMM8[223:192]
//                                      // (YMM1[255:224] * YMM0[255:224]) +
//                                      //  YMM8[255:224] -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmadd132sdcomma ( VFMADD132SD, )
//
// C prototype:
//  void dg_forthvfmadd132sdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMADD132SD instruction. This opcode sequence multiplies the double
//   precision floating point value from the lower 64 bits of the destination 
//   with the corresponding one from the source, then adds the result to the 
//   corresponding value in target y, and then puts the result into the 
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMADD132SD,     // ([RBX][63:0]   * XMM0[63:0]) +
//                                      //  XMM1[63:0]    -> XMM0[63:0]
//
//  XMM2 XMM1 XMM0  VFMADD132SD,        // (XMM2[63:0]    * XMM0[63:0]) +
//                                      //  XMM1[63:0]   -> XMM0[63:0]
//
//  XMM2 <- XMM1 XMM0  VFMADD132SD,  // (XMM0[63:0]   * XMM2[63:0]) +
//                                      //  XMM1[63:0]   -> XMM0[63:0]
//
//  XMM0 XMM1 XMM8 VFMADD132SD,         // (XMM0[63:0]    * XMM8[63:0]) +
//                                      //  XMM1[63:0]    -> XMM8[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmadd213sdcomma ( VFMADD213SD, )
//
// C prototype:
//  void dg_forthvfmadd213sdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMADD213SD instruction. This opcode sequence multiplies the double
//   precision floating point value from the lower 64 bits of the destination with 
//   the corresponding one from target y, then adds the result to the corresponding 
//   value in the source, and then puts the result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMADD213SD,     // (XMM1[63:0]    * XMM0[63:0]) +
//                                      //  [RBX][63:0]   -> XMM0[63:0]
//
//  XMM2 XMM1 XMM0  VFMADD213SD,        // (XMM1[63:0]    * XMM0[63:0]) +
//                                      //  XMM2[63:0]   -> XMM0[63:0]
//
//  XMM2 <- XMM1 XMM0  VFMADD213SD,  // (XMM1[63:0]   * XMM2[63:0]) +
//                                      //  XMM0[63:0]   -> XMM0[63:0]
//
//  XMM0 XMM1 XMM8 VFMADD213SD,         // (XMM1[63:0]    * XMM8[63:0]) +
//                                      //  XMM0[63:0]    -> XMM8[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmadd231sdcomma ( VFMADD231SD, )
//
// C prototype:
//  void dg_forthvfmadd231sdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMADD231SD instruction. This opcode sequence multiplies the double
//   precision floating point value from the lower 64 bits of the source with 
//   the corresponding one from target y, then adds the result to the 
//   corresponding value in the destination, and then puts the result into the 
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMADD231SD,     // (XMM1[63:0]   * [RBX][63:0]) +
//                                      //  XMM0[63:0]   -> XMM0[63:0]
//
//  XMM2 XMM1 XMM0  VFMADD231SD,        // (XMM1[63:0]    * XMM2[63:0]) +
//                                      //  XMM0[63:0]   -> XMM0[63:0]
//
//  XMM2 <- XMM1 XMM0  VFMADD231SD,  // (XMM1[63:0]   * XMM0[63:0]) +
//                                      //  XMM2[63:0]   -> XMM0[63:0]
//
//  XMM0 XMM1 XMM8 VFMADD231SD,         // (XMM1[63:0]   * XMM0[63:0]) +
//                                      //  XMM8[63:0]   -> XMM8[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmadd132sscomma ( VFMADD132SS, )
//
// C prototype:
//  void dg_forthvfmadd132sscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMADD132SS instruction. This opcode sequence multiplies the single
//   precision floating point value from the lower 32 bits of the destination 
//   with the corresponding one from the source, then adds the result to the 
//   corresponding value in target y, and then puts the result into the 
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMADD132SS,     // ([RBX][31:0] * XMM0[31:0]) +
//                                      //  XMM1[31:0] -> XMM0[31:0]
//
//  XMM2 XMM1 XMM0  VFMADD132SS,        // (XMM2[31:0] * XMM0[31:0]) +
//                                      //  XMM1[31:0] -> XMM0[31:0]
//
//  XMM2 <- XMM1 XMM0  VFMADD132SS,  // (XMM0[31:0] * XMM2[31:0]) +
//                                      //  XMM1[31:0] -> XMM0[31:0]
//
//  XMM0 XMM1 XMM8 VFMADD132SS,         // (XMM0[31:0] * XMM8[31:0]) +
//                                      //  XMM1[31:0] -> XMM8[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmadd213sscomma ( VFMADD213SS, )
//
// C prototype:
//  void dg_forthvfmadd213sscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMADD213SS instruction. This opcode sequence multiplies the single
//   precision floating point value from the lower 32 bits of the destination 
//   with the corresponding one from target y, then adds the result to the 
//   corresponding value in the source, and then puts the result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMADD213SS,     // (XMM1[31:0] * XMM0[31:0]) +
//                                      //  [RBX][31:0] -> XMM0[31:0]
//
//  XMM2 XMM1 XMM0  VFMADD213SS,        // (XMM1[31:0] * XMM0[31:0]) +
//                                      //  XMM2[31:0] -> XMM0[31:0]
//
//  XMM2 <- XMM1 XMM0  VFMADD213SS,  // (XMM1[31:0] * XMM2[31:0]) +
//                                      //  XMM0[31:0] -> XMM0[31:0]
//
//  XMM0 XMM1 XMM8 VFMADD213SS,         // (XMM1[31:0] * XMM8[31:0]) +
//                                      //  XMM0[31:0] -> XMM8[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmadd231sscomma ( VFMADD231SS, )
//
// C prototype:
//  void dg_forthvfmadd231sscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMADD231SS instruction. This opcode sequence multiplies the single
//   precision floating point value from the lower 32 bits of the source with 
//   the corresponding one from target y, then adds the result to the 
//   corresponding value in the destination, and then puts the results into the 
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMADD231SS,     // (XMM1[31:0] * [RBX][31:0]) +
//                                      //  XMM0[31:0] -> XMM0[31:0]
//
//  XMM2 XMM1 XMM0  VFMADD231SS,        // (XMM1[31:0] * XMM2[31:0]) +
//                                      //  XMM0[31:0] -> XMM0[31:0]
//
//  XMM2 <- XMM1 XMM0  VFMADD231SS,  // (XMM1[31:0] * XMM0[31:0]) +
//                                      //  XMM2[31:0] -> XMM0[31:0]
//
//  XMM0 XMM1 XMM8 VFMADD231SS,         // (XMM1[31:0] * XMM0[31:0]) +
//                                      //  XMM8[31:0] -> XMM8[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmaddsub132pdcomma ( VFMADDSUB132PD, )
//
// C prototype:
//  void dg_forthvfmaddsub132pdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMADDSUB132PD instruction. This opcode sequence multiplies each double
//   precision floating point value from the destination with the corresponding
//   one from the source, then subtracts the corresponding even indexed values or
//   adds the corresponding odd indexed values in target y from the result, 
//   then puts the results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMADDSUB132PD,  // ([RBX][63:0]   * XMM0[63:0]) -
//                                      //  XMM1[63:0]    -> XMM0[63:0]
//                                      // ([RBX][127:64] * XMM0[127:64]) +
//                                      //  XMM1[127:64]  -> XMM0[127:64]
//
//  XMM2 XMM1 XMM0  VFMADDSUB132PD,     // (XMM2[63:0]    * XMM0[63:0]) -
//                                      //  XMM1[63:0]   -> XMM0[63:0]
//                                      // (XMM2[127:64]  * XMM0[127:64]) +
//                                      //  XMM1[127:64] -> XMM0[127:64]
//
//  XMM2 <- XMM1 XMM0  VFMADDSUB132PD, // (XMM0[63:0]   * XMM2[63:0]) -
//                                        //  XMM1[63:0]   -> XMM0[63:0]
//                                        // (XMM0[127:64] * XMM2[127:64]) +
//                                        //  XMM1[127:64] -> XMM0[127:64]
//
//  YMM0 YMM1 YMM8 VFMADDSUB132PD,      // (YMM0[63:0]    * YMM8[63:0]) -
//                                      //  YMM1[63:0]    -> YMM8[63:0]
//                                      // (YMM0[127:64]  * YMM8[127:64]) +
//                                      //  YMM1[127:64]  -> YMM8[127:64]
//                                      // (YMM0[191:128] * YMM8[191:128]) -
//                                      //  YMM1[191:128] -> YMM8[191:128]
//                                      // (YMM0[255:192] * YMM8[255:192]) +
//                                      //  YMM1[255:192] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmaddsub213pdcomma ( VFMADDSUB213PD, )
//
// C prototype:
//  void dg_forthvfmaddsub213pdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMADDSUB213PD instruction. This opcode sequence multiplies each double
//   precision floating point value from the destination with the corresponding
//   one from target y, then subtracts the corresponding even indexed values or
//   adds the odd indexed values in the source from the result and then puts
//   the results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMADDSUB213PD,  // (XMM1[63:0]    * XMM0[63:0]) -
//                                      //  [RBX][63:0]   -> XMM0[63:0]
//                                      // (XMM1[127:64]  * XMM0[127:64]) +
//                                      //  [RBX][127:64] -> XMM0[127:64]
//
//  XMM2 XMM1 XMM0  VFMADDSUB213PD,     // (XMM1[63:0]    * XMM0[63:0]) -
//                                      //  XMM2[63:0]   -> XMM0[63:0]
//                                      // (XMM1[127:64]  * XMM0[127:64]) +
//                                      //  XMM2[127:64] -> XMM0[127:64]
//
//  XMM2 <- XMM1 XMM0  VFMADDSUB213PD, // (XMM1[63:0]   * XMM2[63:0]) -
//                                        //  XMM0[63:0]   -> XMM0[63:0]
//                                        // (XMM1[127:64] * XMM2[127:64]) +
//                                        //  XMM0[127:64] -> XMM0[127:64]
//
//  YMM0 YMM1 YMM8 VFMADDSUB213PD,      // (YMM1[63:0]    * YMM8[63:0]) -
//                                      //  YMM0[63:0]    -> YMM8[63:0]
//                                      // (YMM1[127:64]  * YMM8[127:64]) +
//                                      //  YMM0[127:64]  -> YMM8[127:64]
//                                      // (YMM1[191:128] * YMM8[191:128]) -
//                                      //  YMM0[191:128] -> YMM8[191:128]
//                                      // (YMM1[255:192] * YMM8[255:192]) +
//                                      //  YMM0[255:192] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmaddsub231pdcomma ( VFMADDSUB231PD, )
//
// C prototype:
//  void dg_forthvfmaddsub231pdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMADDSUB231PD instruction. This opcode sequence multiplies each double
//   precision floating point value from the source with the corresponding
//   one from target y, then subtracts the corresponding even indexed values or 
//   adds the corresponding odd indexed values in the destination from the result 
//   and then puts the results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMADDSUB231PD,  // (XMM1[63:0]   * [RBX][63:0]) -
//                                      //  XMM0[63:0]   -> XMM0[63:0]
//                                      // (XMM1[127:64] * [RBX][127:64]) +
//                                      //  XMM0[127:64] -> XMM0[127:64]
//
//  XMM2 XMM1 XMM0  VFMADDSUB231PD,     // (XMM1[63:0]    * XMM2[63:0]) -
//                                      //  XMM0[63:0]   -> XMM0[63:0]
//                                      // (XMM1[127:64]  * XMM2[127:64]) +
//                                      //  XMM0[127:64] -> XMM0[127:64]
//
//  XMM2 <- XMM1 XMM0  VFMADDSUB231PD, // (XMM1[63:0]   * XMM0[63:0]) -
//                                        //  XMM2[63:0]   -> XMM0[63:0]
//                                        // (XMM1[127:64] * XMM0[127:64]) +
//                                        //  XMM2[127:64] -> XMM0[127:64]
//
//  YMM0 YMM1 YMM8 VFMADDSUB231PD,      // (YMM1[63:0]   * YMM0[63:0]) -
//                                      //  YMM8[63:0]   -> YMM8[63:0]
//                                      // (YMM1[127:64] * YMM0[127:64]) +
//                                      //  YMM8[127:64] -> YMM8[127:64]
//                                      // (YMM1[191:128] * YMM0[191:128]) -
//                                      //  YMM8[191:128] -> YMM8[191:128]
//                                      // (YMM1[255:192] * YMM0[255:192]) +
//                                      //  YMM8[255:192] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmaddsub132pscomma ( VFMADDSUB132PS, )
//
// C prototype:
//  void dg_forthvfmaddsub132pscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMADDSUB132PS instruction. This opcode sequence multiplies each single
//   precision floating point value from the destination with the corresponding
//   one from the source, then adds the corresponding odd indexed values or
//   subtracts the even indexed values in target y from the result, and then puts 
//   the results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMADDSUB132PS,  // ([RBX][31:0] * XMM0[31:0]) -
//                                      //  XMM1[31:0] -> XMM0[31:0]
//                                      // ([RBX][63:32] * XMM0[63:32]) +
//                                      //  XMM1[63:32] -> XMM0[63:32]
//                                      // ([RBX][95:64] * XMM0[95:64]) -
//                                      //  XMM1[95:64] -> XMM0[95:64]
//                                      // ([RBX][127:96] * XMM0[127:96]) +
//                                      //  XMM1[127:96] -> XMM0[127:96]
//
//  XMM2 XMM1 XMM0  VFMADDSUB132PS,     // (XMM2[31:0] * XMM0[31:0]) -
//                                      //  XMM1[31:0] -> XMM0[31:0]
//                                      // (XMM2[63:32] * XMM0[63:32]) +
//                                      //  XMM1[63:32] -> XMM0[63:32]
//                                      // (XMM2[95:64] * XMM0[95:64]) -
//                                      //  XMM1[95:64] -> XMM0[95:64]
//                                      // (XMM2[127:96] * XMM0[127:96]) +
//                                      //  XMM1[127:96] -> XMM0[127:96]
//
//  XMM2 <-  XMM1  XMM0  VFMADDSUB132PS, // (XMM0[31:0] * XMM2[31:0]) -
//                                          //  XMM1[31:0] -> XMM0[31:0]
//                                          // (XMM0[63:32] * XMM2[63:32]) +
//                                          //  XMM1[63:32] -> XMM0[63:32]
//                                          // (XMM0[95:64] * XMM2[95:64]) -
//                                          //  XMM1[95:64] -> XMM0[95:64]
//                                          // (XMM0[127:96] * XMM2[127:96]) +
//                                          //  XMM1[127:96] -> XMM0[127:96]
//
//  YMM0  YMM1  YMM8  VFMADDSUB132PS,   // (YMM0[31:0] * YMM8[31:0]) -
//                                      //  YMM1[31:0] -> YMM8[31:0]
//                                      // (YMM0[63:32] * YMM8[63:32]) +
//                                      //  YMM1[63:32] -> YMM8[63:32]
//                                      // (YMM0[95:64] * YMM8[95:64]) -
//                                      //  YMM1[95:64] -> YMM8[95:64]
//                                      // (YMM0[127:96] * YMM8[127:96]) +
//                                      //  YMM1[127:96] -> YMM8[127:96]
//                                      // (YMM0[159:128] * YMM8[159:128]) -
//                                      //  YMM1[159:128] -> YMM8[159:128]
//                                      // (YMM0[191:160] * YMM8[191:160]) +
//                                      //  YMM1[191:160] -> YMM8[191:160]
//                                      // (YMM0[223:192] * YMM8[223:192]) -
//                                      //  YMM1[223:192] -> YMM8[223:192]
//                                      // (YMM0[255:224] * YMM8[255:224]) +
//                                      //  YMM1[255:224] -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmaddsub213pscomma ( VFMADDSUB213PS, )
//
// C prototype:
//  void dg_forthvfmaddsub213pscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMADDSUB213PS instruction. This opcode sequence multiplies each single
//   precision floating point value from the destination with the corresponding
//   one from target y, then adds the corresponding odd indexed values or
//   subtracts the corresponding even indexed values in the source from the result, 
//   and then puts the results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMADDSUB213PS,  // (XMM1[31:0] * XMM0[31:0]) -
//                                      //  [RBX][31:0] -> XMM0[31:0]
//                                      // (XMM1[63:32] * XMM0[63:32]) +
//                                      //  [RBX][63:32] -> XMM0[63:32]
//                                      // (XMM1[95:64] * XMM0[95:64]) -
//                                      //  [RBX][95:64] -> XMM0[95:64]
//                                      // (XMM1[127:96] * XMM0[127:96]) +
//                                      //  [RBX][127:96] -> XMM0[127:96]
//
//  XMM2 XMM1 XMM0  VFMADDSUB213PS,     // (XMM1[31:0] * XMM0[31:0]) -
//                                      //  XMM2[31:0] -> XMM0[31:0]
//                                      // (XMM1[63:32] * XMM0[63:32]) +
//                                      //  XMM2[63:32] -> XMM0[63:32]
//                                      // (XMM1[95:64] * XMM0[95:64]) -
//                                      //  XMM2[95:64] -> XMM0[95:64]
//                                      // (XMM1[127:96] * XMM0[127:96]) +
//                                      //  XMM2[127:96] -> XMM0[127:96]
//
//  XMM2 <- XMM1 XMM0  VFMADDSUB213PS, // (XMM1[31:0] * XMM2[31:0]) -
//                                        //  XMM0[31:0] -> XMM0[31:0]
//                                        // (XMM1[63:32] * XMM2[63:32]) +
//                                        //  XMM0[63:32] -> XMM0[63:32]
//                                        // (XMM1[95:64] * XMM2[95:64]) -
//                                        //  XMM0[95:64] -> XMM0[95:64]
//                                        // (XMM1[127:96] * XMM2[127:96]) +
//                                        //  XMM0[127:96] -> XMM0[127:96]
//
//  YMM0 YMM1 YMM8 VFMADDSUB213PS,      // (YMM1[31:0] * YMM8[31:0]) -
//                                      //  YMM0[31:0] -> YMM8[31:0]
//                                      // (YMM1[63:32] * YMM8[63:32]) +
//                                      //  YMM0[63:32] -> YMM8[63:32]
//                                      // (YMM1[95:64] * YMM8[95:64]) -
//                                      //  YMM0[95:64] -> YMM8[95:64]
//                                      // (YMM1[127:96] * YMM8[127:96]) +
//                                      //  YMM0[127:96] -> YMM8[127:96]
//                                      // (YMM1[159:128] * YMM8[159:128]) -
//                                      //  YMM0[159:128] -> YMM8[159:128]
//                                      // (YMM1[191:160] * YMM8[191:160]) +
//                                      //  YMM0[191:160] -> YMM8[191:160]
//                                      // (YMM1[223:192] * YMM8[223:192]) -
//                                      //  YMM0[223:192] -> YMM8[223:192]
//                                      // (YMM1[255:224] * YMM8[255:224]) +
//                                      //  YMM0[255:224] -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmaddsub231pscomma ( VFMADDSUB231PS, )
//
// C prototype:
//  void dg_forthvfmaddsub231pscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMADDSUB231PS instruction. This opcode sequence multiplies each single
//   precision floating point value from the source with the corresponding
//   one from target y, then adds the corresponding odd indexed values
//   or subtracts the even indexed values in the destination from the result, 
//   and then puts the results into the destination. 
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMADDSUB231PS,  // (XMM1[31:0] * [RBX][31:0]) -
//                                      //  XMM0[31:0] -> XMM0[31:0]
//                                      // (XMM1[63:32] * [RBX][63:32]) +
//                                      //  XMM0[63:32] -> XMM0[63:32]
//                                      // (XMM1[95:64] * [RBX][95:64]) -
//                                      //  XMM0[95:64] -> XMM0[95:64]
//                                      // (XMM1[127:96] * [RBX][127:96]) +
//                                      //  XMM0[127:96] -> XMM0[127:96]
//
//  XMM2 XMM1 XMM0  VFMADDSUB231PS,     // (XMM1[31:0] * XMM2[31:0]) -
//                                      //  XMM0[31:0] -> XMM0[31:0]
//                                      // (XMM1[63:32] * XMM2[63:32]) +
//                                      //  XMM0[63:32] -> XMM0[63:32]
//                                      // (XMM1[95:64] * XMM2[95:64]) -
//                                      //  XMM0[95:64] -> XMM0[95:64]
//                                      // (XMM1[127:96] * XMM2[127:96]) +
//                                      //  XMM0[127:96] -> XMM0[127:96]
//
//  XMM2 <- XMM1 XMM0  VFMADDSUB231PS, // (XMM1[31:0] * XMM0[31:0]) -
//                                        //  XMM2[31:0] -> XMM0[31:0]
//                                        // (XMM1[63:32] * XMM0[63:32]) +
//                                        //  XMM2[63:32] -> XMM0[63:32]
//                                        // (XMM1[95:64] * XMM0[95:64]) -
//                                        //  XMM2[95:64] -> XMM0[95:64]
//                                        // (XMM1[127:96] * XMM0[127:96]) +
//                                        //  XMM2[127:96] -> XMM0[127:96]
//
//  YMM0 YMM1 YMM8 VFMADDSUB231PS,      // (YMM1[31:0] * YMM0[31:0]) -
//                                      //  YMM8[31:0] -> YMM8[31:0]
//                                      // (YMM1[63:32] * YMM0[63:32]) +
//                                      //  YMM8[63:32] -> YMM8[63:32]
//                                      // (YMM1[95:64] * YMM0[95:64]) -
//                                      //  YMM8[95:64] -> YMM8[95:64]
//                                      // (YMM1[127:96] * YMM0[127:96]) +
//                                      //  YMM8[127:96] -> YMM8[127:96]
//                                      // (YMM1[159:128] * YMM0[159:128]) -
//                                      //  YMM8[159:128] -> YMM8[159:128]
//                                      // (YMM1[191:160] * YMM0[191:160]) +
//                                      //  YMM8[191:160] -> YMM8[191:160]
//                                      // (YMM1[223:192] * YMM0[223:192]) -
//                                      //  YMM8[223:192] -> YMM8[223:192]
//                                      // (YMM1[255:224] * YMM0[255:224]) +
//                                      //  YMM8[255:224] -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmsubadd132pdcomma ( VFMSUBADD132PD, )
//
// C prototype:
//  void dg_forthvfmsubadd132pdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMSUBADD132PD instruction. This opcode sequence multiplies each double
//   precision floating point value from the destination with the corresponding
//   one from the source, then subtracts the corresponding odd indexed values or
//   adds the corresponding even indexed values in target y from the result, 
//   then puts the results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMSUBADD132PD,  // ([RBX][63:0]   * XMM0[63:0]) +
//                                      //  XMM1[63:0]    -> XMM0[63:0]
//                                      // ([RBX][127:64] * XMM0[127:64]) -
//                                      //  XMM1[127:64]  -> XMM0[127:64]
//
//  XMM2 XMM1 XMM0  VFMSUBADD132PD,     // (XMM2[63:0]    * XMM0[63:0]) +
//                                      //  XMM1[63:0]   -> XMM0[63:0]
//                                      // (XMM2[127:64]  * XMM0[127:64]) -
//                                      //  XMM1[127:64] -> XMM0[127:64]
//
//  XMM2 <- XMM1 XMM0  VFMSUBADD132PD, // (XMM0[63:0]   * XMM2[63:0]) +
//                                        //  XMM1[63:0]   -> XMM0[63:0]
//                                        // (XMM0[127:64] * XMM2[127:64]) -
//                                        //  XMM1[127:64] -> XMM0[127:64]
//
//  YMM0 YMM1 YMM8 VFMSUBADD132PD,      // (YMM0[63:0]    * YMM8[63:0]) +
//                                      //  YMM1[63:0]    -> YMM8[63:0]
//                                      // (YMM0[127:64]  * YMM8[127:64]) -
//                                      //  YMM1[127:64]  -> YMM8[127:64]
//                                      // (YMM0[191:128] * YMM8[191:128]) +
//                                      //  YMM1[191:128] -> YMM8[191:128]
//                                      // (YMM0[255:192] * YMM8[255:192]) -
//                                      //  YMM1[255:192] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmsubadd213pdcomma ( VFMSUBADD213PD, )
//
// C prototype:
//  void dg_forthvfmsubadd213pdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMSUBADD213PD instruction. This opcode sequence multiplies each double
//   precision floating point value from the destination with the corresponding
//   one from target y, then subtracts the corresponding odd indexed values or
//   adds the even indexed values in the source from the result and then puts
//   the results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMSUBADD213PD,  // (XMM1[63:0]    * XMM0[63:0]) +
//                                      //  [RBX][63:0]   -> XMM0[63:0]
//                                      // (XMM1[127:64]  * XMM0[127:64]) -
//                                      //  [RBX][127:64] -> XMM0[127:64]
//
//  XMM2 XMM1 XMM0  VFMSUBADD213PD,     // (XMM1[63:0]    * XMM0[63:0]) +
//                                      //  XMM2[63:0]   -> XMM0[63:0]
//                                      // (XMM1[127:64]  * XMM0[127:64]) -
//                                      //  XMM2[127:64] -> XMM0[127:64]
//
//  XMM2 <- XMM1 XMM0  VFMSUBADD213PD, // (XMM1[63:0]   * XMM2[63:0]) +
//                                        //  XMM0[63:0]   -> XMM0[63:0]
//                                        // (XMM1[127:64] * XMM2[127:64]) -
//                                        //  XMM0[127:64] -> XMM0[127:64]
//
//  YMM0 YMM1 YMM8 VFMSUBADD213PD,      // (YMM1[63:0]    * YMM8[63:0]) +
//                                      //  YMM0[63:0]    -> YMM8[63:0]
//                                      // (YMM1[127:64]  * YMM8[127:64]) -
//                                      //  YMM0[127:64]  -> YMM8[127:64]
//                                      // (YMM1[191:128] * YMM8[191:128]) +
//                                      //  YMM0[191:128] -> YMM8[191:128]
//                                      // (YMM1[255:192] * YMM8[255:192]) -
//                                      //  YMM0[255:192] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmsubadd231pdcomma ( VFMSUBADD231PD, )
//
// C prototype:
//  void dg_forthvfmsubadd231pdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMSUBADD231PD instruction. This opcode sequence multiplies each double
//   precision floating point value from the source with the corresponding
//   one from target y, then subtracts the corresponding odd indexed values or 
//   adds the corresponding even indexed values in the destination from the result 
//   and then puts the results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMSUBADD231PD,  // (XMM1[63:0]   * [RBX][63:0]) +
//                                      //  XMM0[63:0]   -> XMM0[63:0]
//                                      // (XMM1[127:64] * [RBX][127:64]) -
//                                      //  XMM0[127:64] -> XMM0[127:64]
//
//  XMM2 XMM1 XMM0  VFMSUBADD231PD,     // (XMM1[63:0]    * XMM2[63:0]) +
//                                      //  XMM0[63:0]   -> XMM0[63:0]
//                                      // (XMM1[127:64]  * XMM2[127:64]) -
//                                      //  XMM0[127:64] -> XMM0[127:64]
//
//  XMM2 <- XMM1 XMM0  VFMSUBADD231PD, // (XMM1[63:0]   * XMM0[63:0]) +
//                                        //  XMM2[63:0]   -> XMM0[63:0]
//                                        // (XMM1[127:64] * XMM0[127:64]) -
//                                        //  XMM2[127:64] -> XMM0[127:64]
//
//  YMM0 YMM1 YMM8 VFMSUBADD231PD,      // (YMM1[63:0]   * YMM0[63:0]) +
//                                      //  YMM8[63:0]   -> YMM8[63:0]
//                                      // (YMM1[127:64] * YMM0[127:64]) -
//                                      //  YMM8[127:64] -> YMM8[127:64]
//                                      // (YMM1[191:128] * YMM0[191:128]) +
//                                      //  YMM8[191:128] -> YMM8[191:128]
//                                      // (YMM1[255:192] * YMM0[255:192]) -
//                                      //  YMM8[255:192] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmsubadd132pscomma ( VFMSUBADD132PS, )
//
// C prototype:
//  void dg_forthvfmsubadd132pscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMSUBADD132PS instruction. This opcode sequence multiplies each single
//   precision floating point value from the destination with the corresponding
//   one from the source, then adds the corresponding even indexed values or
//   subtracts the odd indexed values in target y from the result, and then puts 
//   the results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMSUBADD132PS,  // ([RBX][31:0] * XMM0[31:0]) +
//                                      //  XMM1[31:0] -> XMM0[31:0]
//                                      // ([RBX][63:32] * XMM0[63:32]) -
//                                      //  XMM1[63:32] -> XMM0[63:32]
//                                      // ([RBX][95:64] * XMM0[95:64]) +
//                                      //  XMM1[95:64] -> XMM0[95:64]
//                                      // ([RBX][127:96] * XMM0[127:96]) -
//                                      //  XMM1[127:96] -> XMM0[127:96]
//
//  XMM2 XMM1 XMM0  VFMSUBADD132PS,     // (XMM2[31:0] * XMM0[31:0]) +
//                                      //  XMM1[31:0] -> XMM0[31:0]
//                                      // (XMM2[63:32] * XMM0[63:32]) -
//                                      //  XMM1[63:32] -> XMM0[63:32]
//                                      // (XMM2[95:64] * XMM0[95:64]) +
//                                      //  XMM1[95:64] -> XMM0[95:64]
//                                      // (XMM2[127:96] * XMM0[127:96]) -
//                                      //  XMM1[127:96] -> XMM0[127:96]
//
//  XMM2 <-  XMM1  XMM0  VFMSUBADD132PS, // (XMM0[31:0] * XMM2[31:0]) +
//                                          //  XMM1[31:0] -> XMM0[31:0]
//                                          // (XMM0[63:32] * XMM2[63:32]) -
//                                          //  XMM1[63:32] -> XMM0[63:32]
//                                          // (XMM0[95:64] * XMM2[95:64]) +
//                                          //  XMM1[95:64] -> XMM0[95:64]
//                                          // (XMM0[127:96] * XMM2[127:96]) -
//                                          //  XMM1[127:96] -> XMM0[127:96]
//
//  YMM0  YMM1  YMM8  VFMSUBADD132PS,   // (YMM0[31:0] * YMM8[31:0]) +
//                                      //  YMM1[31:0] -> YMM8[31:0]
//                                      // (YMM0[63:32] * YMM8[63:32]) -
//                                      //  YMM1[63:32] -> YMM8[63:32]
//                                      // (YMM0[95:64] * YMM8[95:64]) +
//                                      //  YMM1[95:64] -> YMM8[95:64]
//                                      // (YMM0[127:96] * YMM8[127:96]) -
//                                      //  YMM1[127:96] -> YMM8[127:96]
//                                      // (YMM0[159:128] * YMM8[159:128]) +
//                                      //  YMM1[159:128] -> YMM8[159:128]
//                                      // (YMM0[191:160] * YMM8[191:160]) -
//                                      //  YMM1[191:160] -> YMM8[191:160]
//                                      // (YMM0[223:192] * YMM8[223:192]) +
//                                      //  YMM1[223:192] -> YMM8[223:192]
//                                      // (YMM0[255:224] * YMM8[255:224]) -
//                                      //  YMM1[255:224] -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmsubadd213pscomma ( VFMSUBADD213PS, )
//
// C prototype:
//  void dg_forthvfmsubadd213pscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMSUBADD213PS instruction. This opcode sequence multiplies each single
//   precision floating point value from the destination with the corresponding
//   one from target y, then adds the corresponding even indexed values or
//   subtracts the corresponding odd indexed values in the source from the result, 
//   and then puts the results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMSUBADD213PS,  // (XMM1[31:0] * XMM0[31:0]) +
//                                      //  [RBX][31:0] -> XMM0[31:0]
//                                      // (XMM1[63:32] * XMM0[63:32]) -
//                                      //  [RBX][63:32] -> XMM0[63:32]
//                                      // (XMM1[95:64] * XMM0[95:64]) +
//                                      //  [RBX][95:64] -> XMM0[95:64]
//                                      // (XMM1[127:96] * XMM0[127:96]) -
//                                      //  [RBX][127:96] -> XMM0[127:96]
//
//  XMM2 XMM1 XMM0  VFMSUBADD213PS,     // (XMM1[31:0] * XMM0[31:0]) +
//                                      //  XMM2[31:0] -> XMM0[31:0]
//                                      // (XMM1[63:32] * XMM0[63:32]) -
//                                      //  XMM2[63:32] -> XMM0[63:32]
//                                      // (XMM1[95:64] * XMM0[95:64]) +
//                                      //  XMM2[95:64] -> XMM0[95:64]
//                                      // (XMM1[127:96] * XMM0[127:96]) -
//                                      //  XMM2[127:96] -> XMM0[127:96]
//
//  XMM2 <- XMM1 XMM0  VFMSUBADD213PS, // (XMM1[31:0] * XMM2[31:0]) +
//                                        //  XMM0[31:0] -> XMM0[31:0]
//                                        // (XMM1[63:32] * XMM2[63:32]) -
//                                        //  XMM0[63:32] -> XMM0[63:32]
//                                        // (XMM1[95:64] * XMM2[95:64]) +
//                                        //  XMM0[95:64] -> XMM0[95:64]
//                                        // (XMM1[127:96] * XMM2[127:96]) -
//                                        //  XMM0[127:96] -> XMM0[127:96]
//
//  YMM0 YMM1 YMM8 VFMSUBADD213PS,      // (YMM1[31:0] * YMM8[31:0]) +
//                                      //  YMM0[31:0] -> YMM8[31:0]
//                                      // (YMM1[63:32] * YMM8[63:32]) -
//                                      //  YMM0[63:32] -> YMM8[63:32]
//                                      // (YMM1[95:64] * YMM8[95:64]) +
//                                      //  YMM0[95:64] -> YMM8[95:64]
//                                      // (YMM1[127:96] * YMM8[127:96]) -
//                                      //  YMM0[127:96] -> YMM8[127:96]
//                                      // (YMM1[159:128] * YMM8[159:128]) +
//                                      //  YMM0[159:128] -> YMM8[159:128]
//                                      // (YMM1[191:160] * YMM8[191:160]) -
//                                      //  YMM0[191:160] -> YMM8[191:160]
//                                      // (YMM1[223:192] * YMM8[223:192]) +
//                                      //  YMM0[223:192] -> YMM8[223:192]
//                                      // (YMM1[255:224] * YMM8[255:224]) -
//                                      //  YMM0[255:224] -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmsubadd231pscomma ( VFMSUBADD231PS, )
//
// C prototype:
//  void dg_forthvfmsubadd231pscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMSUBADD231PS instruction. This opcode sequence multiplies each single
//   precision floating point value from the source with the corresponding
//   one from target y, then adds the corresponding even indexed values
//   or subtracts the odd indexed values in the destination from the result, 
//   and then puts the results into the destination. 
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMSUBADD231PS,  // (XMM1[31:0] * [RBX][31:0]) +
//                                      //  XMM0[31:0] -> XMM0[31:0]
//                                      // (XMM1[63:32] * [RBX][63:32]) -
//                                      //  XMM0[63:32] -> XMM0[63:32]
//                                      // (XMM1[95:64] * [RBX][95:64]) +
//                                      //  XMM0[95:64] -> XMM0[95:64]
//                                      // (XMM1[127:96] * [RBX][127:96]) -
//                                      //  XMM0[127:96] -> XMM0[127:96]
//
//  XMM2 XMM1 XMM0  VFMSUBADD231PS,     // (XMM1[31:0] * XMM2[31:0]) +
//                                      //  XMM0[31:0] -> XMM0[31:0]
//                                      // (XMM1[63:32] * XMM2[63:32]) -
//                                      //  XMM0[63:32] -> XMM0[63:32]
//                                      // (XMM1[95:64] * XMM2[95:64]) +
//                                      //  XMM0[95:64] -> XMM0[95:64]
//                                      // (XMM1[127:96] * XMM2[127:96]) -
//                                      //  XMM0[127:96] -> XMM0[127:96]
//
//  XMM2 <- XMM1 XMM0  VFMSUBADD231PS, // (XMM1[31:0] * XMM0[31:0]) +
//                                        //  XMM2[31:0] -> XMM0[31:0]
//                                        // (XMM1[63:32] * XMM0[63:32]) -
//                                        //  XMM2[63:32] -> XMM0[63:32]
//                                        // (XMM1[95:64] * XMM0[95:64]) +
//                                        //  XMM2[95:64] -> XMM0[95:64]
//                                        // (XMM1[127:96] * XMM0[127:96]) -
//                                        //  XMM2[127:96] -> XMM0[127:96]
//
//  YMM0 YMM1 YMM8 VFMSUBADD231PS,      // (YMM1[31:0] * YMM0[31:0]) +
//                                      //  YMM8[31:0] -> YMM8[31:0]
//                                      // (YMM1[63:32] * YMM0[63:32]) -
//                                      //  YMM8[63:32] -> YMM8[63:32]
//                                      // (YMM1[95:64] * YMM0[95:64]) +
//                                      //  YMM8[95:64] -> YMM8[95:64]
//                                      // (YMM1[127:96] * YMM0[127:96]) -
//                                      //  YMM8[127:96] -> YMM8[127:96]
//                                      // (YMM1[159:128] * YMM0[159:128]) +
//                                      //  YMM8[159:128] -> YMM8[159:128]
//                                      // (YMM1[191:160] * YMM0[191:160]) -
//                                      //  YMM8[191:160] -> YMM8[191:160]
//                                      // (YMM1[223:192] * YMM0[223:192]) +
//                                      //  YMM8[223:192] -> YMM8[223:192]
//                                      // (YMM1[255:224] * YMM0[255:224]) -
//                                      //  YMM8[255:224] -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmsub132pdcomma ( VFMSUB132PD, )
//
// C prototype:
//  void dg_forthvfmsub132pdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMSUB132PD instruction. This opcode sequence multiplies each double
//   precision floating point value from the destination with the corresponding
//   one from the source, then subtracts the corresponding value in target y
//   from the result, and then puts the results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMSUB132PD,     // ([RBX][63:0]   * XMM0[63:0]) -
//                                      //  XMM1[63:0]    -> XMM0[63:0]
//                                      // ([RBX][127:64] * XMM0[127:64]) -
//                                      //  XMM1[127:64]  -> XMM0[127:64]
//
//  XMM2 XMM1 XMM0  VFMSUB132PD,        // (XMM2[63:0]    * XMM0[63:0]) -
//                                      //  XMM1[63:0]   -> XMM0[63:0]
//                                      // (XMM2[127:64]  * XMM0[127:64]) -
//                                      //  XMM1[127:64] -> XMM0[127:64]
//
//  XMM2 <- XMM1 XMM0  VFMSUB132PD,  // (XMM0[63:0]   * XMM2[63:0]) -
//                                      //  XMM1[63:0]   -> XMM0[63:0]
//                                      // (XMM0[127:64] * XMM2[127:64]) -
//                                      //  XMM1[127:64] -> XMM0[127:64]
//
//  YMM0 YMM1 YMM8 VFMSUB132PD,         // (YMM0[63:0]    * YMM8[63:0]) -
//                                      //  YMM1[63:0]    -> YMM8[63:0]
//                                      // (YMM0[127:64]  * YMM8[127:64]) -
//                                      //  YMM1[127:64]  -> YMM8[127:64]
//                                      // (YMM0[191:128] * YMM8[191:128]) -
//                                      //  YMM1[191:128] -> YMM8[191:128]
//                                      // (YMM0[255:192] * YMM8[255:192]) -
//                                      //  YMM1[255:192] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmsub213pdcomma ( VFMSUB213PD, )
//
// C prototype:
//  void dg_forthvfmsub213pdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMSUB213PD instruction. This opcode sequence multiplies each double
//   precision floating point value from the destination with the corresponding
//   one from target y, then subtracts the corresponding value in the source from
//   the result, and then puts the results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMSUB213PD,     // (XMM1[63:0]    * XMM0[63:0]) -
//                                      //  [RBX][63:0]   -> XMM0[63:0]
//                                      // (XMM1[127:64]  * XMM0[127:64]) -
//                                      //  [RBX][127:64] -> XMM0[127:64]
//
//  XMM2 XMM1 XMM0  VFMSUB213PD,        // (XMM1[63:0]    * XMM0[63:0]) -
//                                      //  XMM2[63:0]   -> XMM0[63:0]
//                                      // (XMM1[127:64]  * XMM0[127:64]) -
//                                      //  XMM2[127:64] -> XMM0[127:64]
//
//  XMM2 <- XMM1 XMM0  VFMSUB213PD,  // (XMM1[63:0]   * XMM2[63:0]) -
//                                      //  XMM0[63:0]   -> XMM0[63:0]
//                                      // (XMM1[127:64] * XMM2[127:64]) -
//                                      //  XMM0[127:64] -> XMM0[127:64]
//
//  YMM0 YMM1 YMM8 VFMSUB213PD,         // (YMM1[63:0]    * YMM8[63:0]) -
//                                      //  YMM0[63:0]    -> YMM8[63:0]
//                                      // (YMM1[127:64]  * YMM8[127:64]) -
//                                      //  YMM0[127:64]  -> YMM8[127:64]
//                                      // (YMM1[191:128] * YMM8[191:128]) -
//                                      //  YMM0[191:128] -> YMM8[191:128]
//                                      // (YMM1[255:192] * YMM8[255:192]) -
//                                      //  YMM0[255:192] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmsub231pdcomma ( VFMSUB231PD, )
//
// C prototype:
//  void dg_forthvfmsub231pdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMSUB231PD instruction. This opcode sequence multiplies each double
//   precision floating point value from the source with the corresponding
//   one from target y, then subtracts the corresponding value in the
//   destination from the result, and then puts the results into the 
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMSUB231PD,     // (XMM1[63:0]   * [RBX][63:0]) -
//                                      //  XMM0[63:0]   -> XMM0[63:0]
//                                      // (XMM1[127:64] * [RBX][127:64]) -
//                                      //  XMM0[127:64] -> XMM0[127:64]
//
//  XMM2 XMM1 XMM0  VFMSUB231PD,        // (XMM1[63:0]    * XMM2[63:0]) -
//                                      //  XMM0[63:0]   -> XMM0[63:0]
//                                      // (XMM1[127:64]  * XMM2[127:64]) -
//                                      //  XMM0[127:64] -> XMM0[127:64]
//
//  XMM2 <- XMM1 XMM0  VFMSUB231PD,  // (XMM1[63:0]   * XMM0[63:0]) -
//                                      //  XMM2[63:0]   -> XMM0[63:0]
//                                      // (XMM1[127:64] * XMM0[127:64]) -
//                                      //  XMM2[127:64] -> XMM0[127:64]
//
//  YMM0 YMM1 YMM8 VFMSUB231PD,         // (YMM1[63:0]   * YMM0[63:0]) -
//                                      //  YMM8[63:0]   -> YMM8[63:0]
//                                      // (YMM1[127:64] * YMM0[127:64]) -
//                                      //  YMM8[127:64] -> YMM8[127:64]
//                                      // (YMM1[191:128] * YMM0[191:128]) -
//                                      //  YMM8[191:128] -> YMM8[191:128]
//                                      // (YMM1[255:192] * YMM0[255:192]) -
//                                      //  YMM8[255:192] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmsub132pscomma ( VFMSUB132PS, )
//
// C prototype:
//  void dg_forthvfmsub132pscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMSUB132PS instruction. This opcode sequence multiplies each single
//   precision floating point value from the destination with the corresponding
//   one from the source, then subtracts the corresponding value in target y
//   from the result, and then puts the results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMSUB132PS,     // ([RBX][31:0] * XMM0[31:0]) -
//                                      //  XMM1[31:0] -> XMM0[31:0]
//                                      // ([RBX][63:32] * XMM0[63:32]) -
//                                      //  XMM1[63:32] -> XMM0[63:32]
//                                      // ([RBX][95:64] * XMM0[95:64]) -
//                                      //  XMM1[95:64] -> XMM0[95:64]
//                                      // ([RBX][127:96] * XMM0[127:96]) -
//                                      //  XMM1[127:96] -> XMM0[127:96]
//
//  XMM2 XMM1 XMM0  VFMSUB132PS,        // (XMM2[31:0] * XMM0[31:0]) -
//                                      //  XMM1[31:0] -> XMM0[31:0]
//                                      // (XMM2[63:32] * XMM0[63:32]) -
//                                      //  XMM1[63:32] -> XMM0[63:32]
//                                      // (XMM2[95:64] * XMM0[95:64]) -
//                                      //  XMM1[95:64] -> XMM0[95:64]
//                                      // (XMM2[127:96] * XMM0[127:96]) -
//                                      //  XMM1[127:96] -> XMM0[127:96]
//
//  XMM2 <- XMM1 XMM0  VFMSUB132PS,  // (XMM0[31:0] * XMM2[31:0]) -
//                                      //  XMM1[31:0] -> XMM0[31:0]
//                                      // (XMM0[63:32] * XMM2[63:32]) -
//                                      //  XMM1[63:32] -> XMM0[63:32]
//                                      // (XMM0[95:64] * XMM2[95:64]) -
//                                      //  XMM1[95:64] -> XMM0[95:64]
//                                      // (XMM0[127:96] * XMM2[127:96]) -
//                                      //  XMM1[127:96] -> XMM0[127:96]
//
//  YMM0 YMM1 YMM8 VFMSUB132PS,         // (YMM0[31:0] * YMM8[31:0]) -
//                                      //  YMM1[31:0] -> YMM8[31:0]
//                                      // (YMM0[63:32] * YMM8[63:32]) -
//                                      //  YMM1[63:32] -> YMM8[63:32]
//                                      // (YMM0[95:64] * YMM8[95:64]) -
//                                      //  YMM1[95:64] -> YMM8[95:64]
//                                      // (YMM0[127:96] * YMM8[127:96]) -
//                                      //  YMM1[127:96] -> YMM8[127:96]
//                                      // (YMM0[159:128] * YMM8[159:128]) -
//                                      //  YMM1[159:128] -> YMM8[159:128]
//                                      // (YMM0[191:160] * YMM8[191:160]) -
//                                      //  YMM1[191:160] -> YMM8[191:160]
//                                      // (YMM0[223:192] * YMM8[223:192]) -
//                                      //  YMM1[223:192] -> YMM8[223:192]
//                                      // (YMM0[255:224] * YMM8[255:224]) -
//                                      //  YMM1[255:224] -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmsub213pscomma ( VFMSUB213PS, )
//
// C prototype:
//  void dg_forthvfmsub213pscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMSUB213PS instruction. This opcode sequence multiplies each single
//   precision floating point value from the destination with the corresponding
//   one from target y, then subtracts the corresponding value in the source
//   from the result, and then puts the results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMSUB213PS,     // (XMM1[31:0] * XMM0[31:0]) -
//                                      //  [RBX][31:0] -> XMM0[31:0]
//                                      // (XMM1[63:32] * XMM0[63:32]) -
//                                      //  [RBX][63:32] -> XMM0[63:32]
//                                      // (XMM1[95:64] * XMM0[95:64]) -
//                                      //  [RBX][95:64] -> XMM0[95:64]
//                                      // (XMM1[127:96] * XMM0[127:96]) -
//                                      //  [RBX][127:96] -> XMM0[127:96]
//
//  XMM2 XMM1 XMM0  VFMSUB213PS,        // (XMM1[31:0] * XMM0[31:0]) -
//                                      //  XMM2[31:0] -> XMM0[31:0]
//                                      // (XMM1[63:32] * XMM0[63:32]) -
//                                      //  XMM2[63:32] -> XMM0[63:32]
//                                      // (XMM1[95:64] * XMM0[95:64]) -
//                                      //  XMM2[95:64] -> XMM0[95:64]
//                                      // (XMM1[127:96] * XMM0[127:96]) -
//                                      //  XMM2[127:96] -> XMM0[127:96]
//
//  XMM2 <- XMM1 XMM0  VFMSUB213PS,  // (XMM1[31:0] * XMM2[31:0]) -
//                                      //  XMM0[31:0] -> XMM0[31:0]
//                                      // (XMM1[63:32] * XMM2[63:32]) -
//                                      //  XMM0[63:32] -> XMM0[63:32]
//                                      // (XMM1[95:64] * XMM2[95:64]) -
//                                      //  XMM0[95:64] -> XMM0[95:64]
//                                      // (XMM1[127:96] * XMM2[127:96]) -
//                                      //  XMM0[127:96] -> XMM0[127:96]
//
//  YMM0 YMM1 YMM8 VFMSUB213PS,         // (YMM1[31:0] * YMM8[31:0]) -
//                                      //  YMM0[31:0] -> YMM8[31:0]
//                                      // (YMM1[63:32] * YMM8[63:32]) -
//                                      //  YMM0[63:32] -> YMM8[63:32]
//                                      // (YMM1[95:64] * YMM8[95:64]) -
//                                      //  YMM0[95:64] -> YMM8[95:64]
//                                      // (YMM1[127:96] * YMM8[127:96]) -
//                                      //  YMM0[127:96] -> YMM8[127:96]
//                                      // (YMM1[159:128] * YMM8[159:128]) -
//                                      //  YMM0[159:128] -> YMM8[159:128]
//                                      // (YMM1[191:160] * YMM8[191:160]) -
//                                      //  YMM0[191:160] -> YMM8[191:160]
//                                      // (YMM1[223:192] * YMM8[223:192]) -
//                                      //  YMM0[223:192] -> YMM8[223:192]
//                                      // (YMM1[255:224] * YMM8[255:224]) -
//                                      //  YMM0[255:224] -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmsub231pscomma ( VFMSUB231PS, )
//
// C prototype:
//  void dg_forthvfmsub231pscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMSUB231PS instruction. This opcode sequence multiplies each single
//   precision floating point value from the source with the corresponding
//   one from target y, then subtracts the corresponding value in the
//   destination from the result, and then puts the results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMSUB231PS,     // (XMM1[31:0] * [RBX][31:0]) -
//                                      //  XMM0[31:0] -> XMM0[31:0]
//                                      // (XMM1[63:32] * [RBX][63:32]) -
//                                      //  XMM0[63:32] -> XMM0[63:32]
//                                      // (XMM1[95:64] * [RBX][95:64]) -
//                                      //  XMM0[95:64] -> XMM0[95:64]
//                                      // (XMM1[127:96] * [RBX][127:96]) -
//                                      //  XMM0[127:96] -> XMM0[127:96]
//
//  XMM2 XMM1 XMM0  VFMSUB231PS,        // (XMM1[31:0] * XMM2[31:0]) -
//                                      //  XMM0[31:0] -> XMM0[31:0]
//                                      // (XMM1[63:32] * XMM2[63:32]) -
//                                      //  XMM0[63:32] -> XMM0[63:32]
//                                      // (XMM1[95:64] * XMM2[95:64]) -
//                                      //  XMM0[95:64] -> XMM0[95:64]
//                                      // (XMM1[127:96] * XMM2[127:96]) -
//                                      //  XMM0[127:96] -> XMM0[127:96]
//
//  XMM2 <- XMM1 XMM0  VFMSUB231PS,  // (XMM1[31:0] * XMM0[31:0]) -
//                                      //  XMM2[31:0] -> XMM0[31:0]
//                                      // (XMM1[63:32] * XMM0[63:32]) -
//                                      //  XMM2[63:32] -> XMM0[63:32]
//                                      // (XMM1[95:64] * XMM0[95:64]) -
//                                      //  XMM2[95:64] -> XMM0[95:64]
//                                      // (XMM1[127:96] * XMM0[127:96]) -
//                                      //  XMM2[127:96] -> XMM0[127:96]
//
//  YMM0 YMM1 YMM8 VFMSUB231PS,         // (YMM1[31:0] * YMM0[31:0]) -
//                                      //  YMM8[31:0] -> YMM8[31:0]
//                                      // (YMM1[63:32] * YMM0[63:32]) -
//                                      //  YMM8[63:32] -> YMM8[63:32]
//                                      // (YMM1[95:64] * YMM0[95:64]) -
//                                      //  YMM8[95:64] -> YMM8[95:64]
//                                      // (YMM1[127:96] * YMM0[127:96]) -
//                                      //  YMM8[127:96] -> YMM8[127:96]
//                                      // (YMM1[159:128] * YMM0[159:128]) -
//                                      //  YMM8[159:128] -> YMM8[159:128]
//                                      // (YMM1[191:160] * YMM0[191:160]) -
//                                      //  YMM8[191:160] -> YMM8[191:160]
//                                      // (YMM1[223:192] * YMM0[223:192]) -
//                                      //  YMM8[223:192] -> YMM8[223:192]
//                                      // (YMM1[255:224] * YMM0[255:224]) -
//                                      //  YMM8[255:224] -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmsub132sdcomma ( VFMSUB132SD, )
//
// C prototype:
//  void dg_forthvfmsub132sdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMSUB132SD instruction. This opcode sequence multiplies the double
//   precision floating point value from the lower 64 bits of the destination 
//   with the corresponding one from the source, then subtracts the 
//   corresponding value in target y from the result, and then puts the result 
//   into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMSUB132SD,     // ([RBX][63:0]   * XMM0[63:0]) -
//                                      //  XMM1[63:0]    -> XMM0[63:0]
//
//  XMM2 XMM1 XMM0  VFMSUB132SD,        // (XMM2[63:0]    * XMM0[63:0]) -
//                                      //  XMM1[63:0]   -> XMM0[63:0]
//
//  XMM2 <- XMM1 XMM0  VFMSUB132SD,  // (XMM0[63:0]   * XMM2[63:0]) -
//                                      //  XMM1[63:0]   -> XMM0[63:0]
//
//  XMM0 XMM1 XMM8 VFMSUB132SD,         // (XMM0[63:0]    * XMM8[63:0]) -
//                                      //  XMM1[63:0]    -> XMM8[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmsub213sdcomma ( VFMSUB213SD, )
//
// C prototype:
//  void dg_forthvfmsub213sdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMSUB213SD instruction. This opcode sequence multiplies the double
//   precision floating point value from the lower 64 bits of the destination with 
//   the corresponding one from target y, then subtracts the corresponding 
//   value in the source from the result, and then puts the result into the 
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMSUB213SD,     // (XMM1[63:0]    * XMM0[63:0]) -
//                                      //  [RBX][63:0]   -> XMM0[63:0]
//
//  XMM2 XMM1 XMM0  VFMSUB213SD,        // (XMM1[63:0]    * XMM0[63:0]) -
//                                      //  XMM2[63:0]   -> XMM0[63:0]
//
//  XMM2 <- XMM1 XMM0  VFMSUB213SD,  // (XMM1[63:0]   * XMM2[63:0]) -
//                                      //  XMM0[63:0]   -> XMM0[63:0]
//
//  XMM0 XMM1 XMM8 VFMSUB213SD,         // (XMM1[63:0]    * XMM8[63:0]) -
//                                      //  XMM0[63:0]    -> XMM8[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmsub231sdcomma ( VFMSUB231SD, )
//
// C prototype:
//  void dg_forthvfmsub231sdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMSUB231SD instruction. This opcode sequence multiplies the double
//   precision floating point value from the lower 64 bits of the source with 
//   the corresponding one from target y, then subtracts the corresponding
//   value in the destination from the result, and then puts the result into the 
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMSUB231SD,     // (XMM1[63:0]   * [RBX][63:0]) -
//                                      //  XMM0[63:0]   -> XMM0[63:0]
//
//  XMM2 XMM1 XMM0  VFMSUB231SD,        // (XMM1[63:0]    * XMM2[63:0]) -
//                                      //  XMM0[63:0]   -> XMM0[63:0]
//
//  XMM2 <- XMM1 XMM0  VFMSUB231SD,  // (XMM1[63:0]   * XMM0[63:0]) -
//                                      //  XMM2[63:0]   -> XMM0[63:0]
//
//  XMM0 XMM1 XMM8 VFMSUB231SD,         // (XMM1[63:0]   * XMM0[63:0]) -
//                                      //  XMM8[63:0]   -> XMM8[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmsub132sscomma ( VFMSUB132SS, )
//
// C prototype:
//  void dg_forthvfmsub132sscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMSUB132SS instruction. This opcode sequence multiplies the single
//   precision floating point value from the lower 32 bits of the destination 
//   with the corresponding one from the source, then subtracts the  
//   corresponding value in target y from the result, and then puts the result 
//   into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMSUB132SS,     // ([RBX][31:0] * XMM0[31:0]) -
//                                      //  XMM1[31:0] -> XMM0[31:0]
//
//  XMM2 XMM1 XMM0  VFMSUB132SS,        // (XMM2[31:0] * XMM0[31:0]) -
//                                      //  XMM1[31:0] -> XMM0[31:0]
//
//  XMM2 <- XMM1 XMM0  VFMSUB132SS,  // (XMM0[31:0] * XMM2[31:0]) -
//                                      //  XMM1[31:0] -> XMM0[31:0]
//
//  XMM0 XMM1 XMM8 VFMSUB132SS,         // (XMM0[31:0] * XMM8[31:0]) -
//                                      //  XMM1[31:0] -> XMM8[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmsub213sscomma ( VFMSUB213SS, )
//
// C prototype:
//  void dg_forthvfmsub213sscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMSUB213SS instruction. This opcode sequence multiplies the single
//   precision floating point value from the lower 32 bits of the destination 
//   with the corresponding one from target y, then subtracts the corresponding
//   value in the source from the result, and then puts the result into the 
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMSUB213SS,     // (XMM1[31:0] * XMM0[31:0]) -
//                                      //  [RBX][31:0] -> XMM0[31:0]
//
//  XMM2 XMM1 XMM0  VFMSUB213SS,        // (XMM1[31:0] * XMM0[31:0]) -
//                                      //  XMM2[31:0] -> XMM0[31:0]
//
//  XMM2 <- XMM1 XMM0  VFMSUB213SS,  // (XMM1[31:0] * XMM2[31:0]) -
//                                      //  XMM0[31:0] -> XMM0[31:0]
//
//  XMM0 XMM1 XMM8 VFMSUB213SS,         // (XMM1[31:0] * XMM8[31:0]) -
//                                      //  XMM0[31:0] -> XMM8[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfmsub231sscomma ( VFMSUB231SS, )
//
// C prototype:
//  void dg_forthvfmsub231sscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFMSUB231SS instruction. This opcode sequence multiplies the single
//   precision floating point value from the lower 32 bits of the source with
//   the corresponding one from target y, then subtracts the corresponding
//   value in the destination from the result, and then puts the result into
//   the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFMSUB231SS,     // (XMM1[31:0] * [RBX][31:0]) -
//                                      //  XMM0[31:0] -> XMM0[31:0]
//
//  XMM2 XMM1 XMM0  VFMSUB231SS,        // (XMM1[31:0] * XMM2[31:0]) -
//                                      //  XMM0[31:0] -> XMM0[31:0]
//
//  XMM2 <- XMM1 XMM0  VFMSUB231SS,  // (XMM1[31:0] * XMM0[31:0]) -
//                                      //  XMM2[31:0] -> XMM0[31:0]
//
//  XMM0 XMM1 XMM8 VFMSUB231SS,         // (XMM1[31:0] * XMM0[31:0]) -
//                                      //  XMM8[31:0] -> XMM8[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfnmadd132pdcomma ( VFNMADD132PD, )
//
// C prototype:
//  void dg_forthvfnmadd132pdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFNMADD132PD instruction. This opcode sequence multiplies each double
//   precision floating point value from the destination with the corresponding
//   one from the source, then negates the result and adds it to the 
//   corresponding value in target y, and then puts the results into the 
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFNMADD132PD,    // (0 - ([RBX][63:0]   * XMM0[63:0])) +
//                                      //  XMM1[63:0]    -> XMM0[63:0]
//                                      // (0 - ([RBX][127:64] * XMM0[127:64])) +
//                                      //  XMM1[127:64]  -> XMM0[127:64]
//
//  XMM2 XMM1 XMM0  VFNMADD132PD,       // (0 - (XMM2[63:0]    * XMM0[63:0])) +
//                                      //  XMM1[63:0]   -> XMM0[63:0]
//                                      // (0 - (XMM2[127:64]  * XMM0[127:64])) +
//                                      //  XMM1[127:64] -> XMM0[127:64]
//
//  XMM2 <- XMM1 XMM0  VFNMADD132PD, // (0 - (XMM0[63:0]   * XMM2[63:0])) +
//                                      //  XMM1[63:0]   -> XMM0[63:0]
//                                      // (0 - (XMM0[127:64] * XMM2[127:64])) +
//                                      //  XMM1[127:64] -> XMM0[127:64]
//
//  YMM0 YMM1 YMM8 VFNMADD132PD,        // (0 - (YMM0[63:0]    * YMM8[63:0])) +
//                                      //  YMM1[63:0]    -> YMM8[63:0]
//                                      // (0 - (YMM0[127:64]  * YMM8[127:64])) +
//                                      //  YMM1[127:64]  -> YMM8[127:64]
//                                      // (0 - (YMM0[191:128] * YMM8[191:128])) +
//                                      //  YMM1[191:128] -> YMM8[191:128]
//                                      // (0 - (YMM0[255:192] * YMM8[255:192])) +
//                                      //  YMM1[255:192] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfnmadd213pdcomma ( VFNMADD213PD, )
//
// C prototype:
//  void dg_forthvfnmadd213pdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFNMADD213PD instruction. This opcode sequence multiplies each double
//   precision floating point value from the destination with the corresponding
//   one from target y, then negates the result and adds it to the corresponding 
//   value in the source, and then puts the results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFNMADD213PD,    // (0 - (XMM1[63:0]    * XMM0[63:0])) +
//                                      //  [RBX][63:0]   -> XMM0[63:0]
//                                      // (0 - (XMM1[127:64]  * XMM0[127:64])) +
//                                      //  [RBX][127:64] -> XMM0[127:64]
//
//  XMM2 XMM1 XMM0  VFNMADD213PD,       // (0 - (XMM1[63:0]    * XMM0[63:0])) +
//                                      //  XMM2[63:0]   -> XMM0[63:0]
//                                      // (0 - (XMM1[127:64]  * XMM0[127:64])) +
//                                      //  XMM2[127:64] -> XMM0[127:64]
//
//  XMM2 <- XMM1 XMM0  VFNMADD213PD, // (0 - (XMM1[63:0]   * XMM2[63:0])) +
//                                      //  XMM0[63:0]   -> XMM0[63:0]
//                                      // (0 - (XMM1[127:64] * XMM2[127:64])) +
//                                      //  XMM0[127:64] -> XMM0[127:64]
//
//  YMM0 YMM1 YMM8 VFNMADD213PD,      // (0 - (YMM1[63:0]    * YMM8[63:0])) +
//                                    //  YMM0[63:0]    -> YMM8[63:0]
//                                    // (0 - (YMM1[127:64]  * YMM8[127:64])) +
//                                    //  YMM0[127:64]  -> YMM8[127:64]
//                                    // (0 - (YMM1[191:128] * YMM8[191:128])) +
//                                    //  YMM0[191:128] -> YMM8[191:128]
//                                    // (0 - (YMM1[255:192] * YMM8[255:192])) +
//                                    //  YMM0[255:192] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfnmadd231pdcomma ( VFNMADD231PD, )
//
// C prototype:
//  void dg_forthvfnmadd231pdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFNMADD231PD instruction. This opcode sequence multiplies each double
//   precision floating point value from the source with the corresponding
//   one from target y, then negates the result and adds it to the corresponding 
//   value in the destination, and then puts the results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFNMADD231PD,  // (0 - (XMM1[63:0]   * [RBX][63:0])) +
//                                    //  XMM0[63:0]   -> XMM0[63:0]
//                                    // (0 - (XMM1[127:64] * [RBX][127:64])) +
//                                    //  XMM0[127:64] -> XMM0[127:64]
//
//  XMM2 XMM1 XMM0  VFNMADD231PD,      // (0 - (XMM1[63:0]    * XMM2[63:0])) +
//                                     //  XMM0[63:0]   -> XMM0[63:0]
//                                     // (0 - (XMM1[127:64]  * XMM2[127:64])) +
//                                     //  XMM0[127:64] -> XMM0[127:64]
//
//  XMM2 <- XMM1 XMM0  VFNMADD231PD, // (0 - (XMM1[63:0]   * XMM0[63:0])) +
//                                      //  XMM2[63:0]   -> XMM0[63:0]
//                                      // (0 - (XMM1[127:64] * XMM0[127:64])) +
//                                      //  XMM2[127:64] -> XMM0[127:64]
//
//  YMM0 YMM1 YMM8 VFNMADD231PD,      // (0 - (YMM1[63:0]   * YMM0[63:0])) +
//                                    //  YMM8[63:0]   -> YMM8[63:0]
//                                    // (0 - (YMM1[127:64] * YMM0[127:64])) +
//                                    //  YMM8[127:64] -> YMM8[127:64]
//                                    // (0 - (YMM1[191:128] * YMM0[191:128])) +
//                                    //  YMM8[191:128] -> YMM8[191:128]
//                                    // (0 - (YMM1[255:192] * YMM0[255:192])) +
//                                    //  YMM8[255:192] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfnmadd132pscomma ( VFNMADD132PS, )
//
// C prototype:
//  void dg_forthvfnmadd132pscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFNMADD132PS instruction. This opcode sequence multiplies each single
//   precision floating point value from the destination with the corresponding
//   one from the source, then negates the result and adds it to the
//   corresponding value in target y, and then puts the results into the
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFNMADD132PS,    // (0 - ([RBX][31:0] * XMM0[31:0])) +
//                                      //  XMM1[31:0] -> XMM0[31:0]
//                                      // (0 - ([RBX][63:32] * XMM0[63:32])) +
//                                      //  XMM1[63:32] -> XMM0[63:32]
//                                      // (0 - ([RBX][95:64] * XMM0[95:64])) +
//                                      //  XMM1[95:64] -> XMM0[95:64]
//                                      // (0 - ([RBX][127:96] * XMM0[127:96])) +
//                                      //  XMM1[127:96] -> XMM0[127:96]
//
//  XMM2 XMM1 XMM0  VFNMADD132PS,       // (0 - (XMM2[31:0] * XMM0[31:0])) +
//                                      //  XMM1[31:0] -> XMM0[31:0]
//                                      // (0 - (XMM2[63:32] * XMM0[63:32])) +
//                                      //  XMM1[63:32] -> XMM0[63:32]
//                                      // (0 - (XMM2[95:64] * XMM0[95:64])) +
//                                      //  XMM1[95:64] -> XMM0[95:64]
//                                      // (0 - (XMM2[127:96] * XMM0[127:96])) +
//                                      //  XMM1[127:96] -> XMM0[127:96]
//
//  XMM2 <- XMM1 XMM0  VFNMADD123PS, // (0 - (XMM0[31:0] * XMM2[31:0])) +
//                                      //  XMM1[31:0] -> XMM0[31:0]
//                                      // (0 - (XMM0[63:32] * XMM2[63:32])) +
//                                      //  XMM1[63:32] -> XMM0[63:32]
//                                      // (0 - (XMM0[95:64] * XMM2[95:64])) +
//                                      //  XMM1[95:64] -> XMM0[95:64]
//                                      // (0 - (XMM0[127:96] * XMM2[127:96])) +
//                                      //  XMM1[127:96] -> XMM0[127:96]
//
//  YMM0 YMM1 YMM8 VFNMADD132PS,        // (0 - (YMM0[31:0] * YMM8[31:0])) +
//                                      //  YMM1[31:0] -> YMM8[31:0]
//                                      // (0 - (YMM0[63:32] * YMM8[63:32])) +
//                                      //  YMM1[63:32] -> YMM8[63:32]
//                                      // (0 - (YMM0[95:64] * YMM8[95:64])) +
//                                      //  YMM1[95:64] -> YMM8[95:64]
//                                      // (0 - (YMM0[127:96] * YMM8[127:96])) +
//                                      //  YMM1[127:96] -> YMM8[127:96]
//                                      // (0 - (YMM0[159:128] * YMM8[159:128])) +
//                                      //  YMM1[159:128] -> YMM8[159:128]
//                                      // (0 - (YMM0[191:160] * YMM8[191:160])) +
//                                      //  YMM1[191:160] -> YMM8[191:160]
//                                      // (0 - (YMM0[223:192] * YMM8[223:192])) +
//                                      //  YMM1[223:192] -> YMM8[223:192]
//                                      // (0 - (YMM0[255:224] * YMM8[255:224])) +
//                                      //  YMM1[255:224] -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfnmadd213pscomma ( VFNMADD213PS, )
//
// C prototype:
//  void dg_forthvfnmadd213pscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFNMADD213PS instruction. This opcode sequence multiplies each single
//   precision floating point value from the destination with the corresponding
//   one from target y, then negates the result and adds it to the corresponding
//   value in the source, and then puts the results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFNMADD213PS,    // (0 - (XMM1[31:0] * XMM0[31:0])) +
//                                      //  [RBX][31:0] -> XMM0[31:0]
//                                      // (0 - (XMM1[63:32] * XMM0[63:32])) +
//                                      //  [RBX][63:32] -> XMM0[63:32]
//                                      // (0 - (XMM1[95:64] * XMM0[95:64])) +
//                                      //  [RBX][95:64] -> XMM0[95:64]
//                                      // (0 - (XMM1[127:96] * XMM0[127:96])) +
//                                      //  [RBX][127:96] -> XMM0[127:96]
//
//  XMM2 XMM1 XMM0  VFNMADD213PS,       // (0 - (XMM1[31:0] * XMM0[31:0])) +
//                                      //  XMM2[31:0] -> XMM0[31:0]
//                                      // (0 - (XMM1[63:32] * XMM0[63:32])) +
//                                      //  XMM2[63:32] -> XMM0[63:32]
//                                      // (0 - (XMM1[95:64] * XMM0[95:64])) +
//                                      //  XMM2[95:64] -> XMM0[95:64]
//                                      // (0 - (XMM1[127:96] * XMM0[127:96])) +
//                                      //  XMM2[127:96] -> XMM0[127:96]
//
//  XMM2 <- XMM1 XMM0  VFNMADD213PS, // (0 - (XMM1[31:0] * XMM2[31:0])) +
//                                      //  XMM0[31:0] -> XMM0[31:0]
//                                      // (0 - (XMM1[63:32] * XMM2[63:32])) +
//                                      //  XMM0[63:32] -> XMM0[63:32]
//                                      // (0 - (XMM1[95:64] * XMM2[95:64])) +
//                                      //  XMM0[95:64] -> XMM0[95:64]
//                                      // (0 - (XMM1[127:96] * XMM2[127:96])) +
//                                      //  XMM0[127:96] -> XMM0[127:96]
//
//  YMM0 YMM1 YMM8 VFNMADD213PS,     // (0 - (YMM1[31:0] * YMM8[31:0])) +
//                                   //  YMM0[31:0] -> YMM8[31:0]
//                                   // (0 - (YMM1[63:32] * YMM8[63:32])) +
//                                   //  YMM0[63:32] -> YMM8[63:32]
//                                   // (0 - (YMM1[95:64] * YMM8[95:64])) +
//                                   //  YMM0[95:64] -> YMM8[95:64]
//                                   // (0 - (YMM1[127:96] * YMM8[127:96])) +
//                                   //  YMM0[127:96] -> YMM8[127:96]
//                                   // (0 - (YMM1[159:128] * YMM8[159:128])) +
//                                   //  YMM0[159:128] -> YMM8[159:128]
//                                   // (0 - (YMM1[191:160] * YMM8[191:160])) +
//                                   //  YMM0[191:160] -> YMM8[191:160]
//                                   // (0 - (YMM1[223:192] * YMM8[223:192])) +
//                                   //  YMM0[223:192] -> YMM8[223:192]
//                                   // (0 - (YMM1[255:224] * YMM8[255:224])) +
//                                   //  YMM0[255:224] -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfnmadd231pscomma ( VFNMADD231PS, )
//
// C prototype:
//  void dg_forthvfnmadd231pscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFNMADD231PS instruction. This opcode sequence multiplies each single
//   precision floating point value from the source with the corresponding
//   one from target y, then negates the result and adds it to the corresponding
//   value in the destination, and then puts the results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFNMADD231PS,  // (0 - (XMM1[31:0] * [RBX][31:0])) +
//                                    //  XMM0[31:0] -> XMM0[31:0]
//                                    // (0 - (XMM1[63:32] * [RBX][63:32])) +
//                                    //  XMM0[63:32] -> XMM0[63:32]
//                                    // (0 - (XMM1[95:64] * [RBX][95:64])) +
//                                    //  XMM0[95:64] -> XMM0[95:64]
//                                    // (0 - (XMM1[127:96] * [RBX][127:96])) +
//                                    //  XMM0[127:96] -> XMM0[127:96]
//
//  XMM2 XMM1 XMM0  VFNMADD231PS,     // (0 - (XMM1[31:0] * XMM2[31:0])) +
//                                    //  XMM0[31:0] -> XMM0[31:0]
//                                    // (0 - (XMM1[63:32] * XMM2[63:32])) +
//                                    //  XMM0[63:32] -> XMM0[63:32]
//                                    // (0 - (XMM1[95:64] * XMM2[95:64])) +
//                                    //  XMM0[95:64] -> XMM0[95:64]
//                                    // (0 - (XMM1[127:96] * XMM2[127:96])) +
//                                    //  XMM0[127:96] -> XMM0[127:96]
//
//  XMM2 <- XMM1 XMM0  VFNMADD231PS, // (0 - (XMM1[31:0] * XMM0[31:0])) +
//                                      //  XMM2[31:0] -> XMM0[31:0]
//                                      // (0 - (XMM1[63:32] * XMM0[63:32])) +
//                                      //  XMM2[63:32] -> XMM0[63:32]
//                                      // (0 - (XMM1[95:64] * XMM0[95:64])) +
//                                      //  XMM2[95:64] -> XMM0[95:64]
//                                      // (0 - (XMM1[127:96] * XMM0[127:96])) +
//                                      //  XMM2[127:96] -> XMM0[127:96]
// 
//  YMM0 YMM1 YMM8 VFNMADD231PS,    // (0 - (YMM1[31:0] * YMM0[31:0])) +
//                                  //  YMM8[31:0] -> YMM8[31:0]
//                                  // (0 - (YMM1[63:32] * YMM0[63:32])) +
//                                  //  YMM8[63:32] -> YMM8[63:32]
//                                  // (0 - (YMM1[95:64] * YMM0[95:64])) +
//                                  //  YMM8[95:64] -> YMM8[95:64]
//                                  // (0 - (YMM1[127:96] * YMM0[127:96])) +
//                                  //  YMM8[127:96] -> YMM8[127:96]
//                                  // (0 - (YMM1[159:128] * YMM0[159:128])) +
//                                  //  YMM8[159:128] -> YMM8[159:128]
//                                  // (0 - (YMM1[191:160] * YMM0[191:160])) +
//                                  //  YMM8[191:160] -> YMM8[191:160]
//                                  // (0 - (YMM1[223:192] * YMM0[223:192])) +
//                                  //  YMM8[223:192] -> YMM8[223:192]
//                                  // (0 - (YMM1[255:224] * YMM0[255:224])) +
//                                  //  YMM8[255:224] -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfnmadd132sdcomma ( VFNMADD132SD, )
//
// C prototype:
//  void dg_forthvfnmadd132sdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFNMADD132SD instruction. This opcode sequence multiplies the double
//   precision floating point value from the lower 64 bits of the destination
//   with the corresponding one from the source, then negates the result and
//   adds it to the corresponding value in target y, and then puts the result
//   into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFNMADD132SD,    // (0 - ([RBX][63:0]   * XMM0[63:0])) +
//                                      //  XMM1[63:0]    -> XMM0[63:0]
//
//  XMM2 XMM1 XMM0  VFNMADD132SD,       // (0 - (XMM2[63:0]    * XMM0[63:0])) +
//                                      //  XMM1[63:0]   -> XMM0[63:0]
//
//  XMM2 <- XMM1 XMM0  VFNMADD123SD, // (0 - (XMM0[63:0]   * XMM2[63:0])) +
//                                      //  XMM1[63:0]   -> XMM0[63:0]
//
//  XMM0 XMM1 XMM8 VFNMADD132SD,        // (0 - (XMM0[63:0]    * XMM8[63:0])) +
//                                      //  XMM1[63:0]    -> XMM8[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfnmadd213sdcomma ( VFNMADD213SD, )
//
// C prototype:
//  void dg_forthvfnmadd213sdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFNMADD213SD instruction. This opcode sequence multiplies the double
//   precision floating point value from the lower 64 bits of the destination with 
//   the corresponding one from target y, then negates the result and adds it to  
//   the corresponding value in the source, and then puts the result into the 
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFNMADD213SD,   // (0 - (XMM1[63:0]    * XMM0[63:0])) +
//                                     //  [RBX][63:0]   -> XMM0[63:0]
//
//  XMM2 XMM1 XMM0  VFNMADD213SD,      // (0 - (XMM1[63:0]    * XMM0[63:0])) +
//                                     //  XMM2[63:0]   -> XMM0[63:0]
//
//  XMM2 <- XMM1 XMM0  VFNMADD213SD, // (0 - (XMM1[63:0]   * XMM2[63:0])) +
//                                      //  XMM0[63:0]   -> XMM2[63:0]
//
//  XMM0 XMM1 XMM8 VFNMADD213SD,       // (0 - (XMM1[63:0]    * XMM8[63:0])) +
//                                     //  XMM0[63:0]    -> XMM8[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfnmadd231sdcomma ( VFNMADD231SD, )
//
// C prototype:
//  void dg_forthvfnmadd231sdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFNMADD231SD instruction. This opcode sequence multiplies the double
//   precision floating point value from the lower 64 bits of the source with
//   the corresponding one from target y, then negates the result and adds it
//   to the corresponding value in the destination, and then puts the result
//   into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFNMADD231SD,  // (0 - (XMM1[63:0]   * [RBX][63:0])) +
//                                    //  XMM0[63:0]   -> XMM0[63:0]
//
//  XMM2 XMM1 XMM0  VFNMADD231SD,     // (0 - (XMM1[63:0]    * XMM2[63:0])) +
//                                    //  XMM0[63:0]   -> XMM0[63:0]
//
//  XMM2 <- XMM1 XMM0  VFNMADD231SD, // (0 - (XMM1[63:0]   * XMM0[63:0])) +
//                                      //  XMM2[63:0]   -> XMM2[63:0]
//
//  XMM0 XMM1 XMM8 VFNMADD231SD,        // (0 - (XMM1[63:0]   * XMM0[63:0])) +
//                                      //  XMM8[63:0]   -> XMM8[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfnmadd132sscomma ( VFNMADD132SS, )
//
// C prototype:
//  void dg_forthvfnmadd132sscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFNMADD132SS instruction. This opcode sequence multiplies the single
//   precision floating point value from the lower 32 bits of the destination 
//   with the corresponding one from the source, then negates the result and
//   adds it to the corresponding value in target y, and then puts the result 
//   into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFNMADD132SS,    // (0 - ([RBX][31:0] * XMM0[31:0])) +
//                                      //  XMM1[31:0] -> XMM0[31:0]
//
//  XMM2 XMM1 XMM0  VFNMADD132SS,       // (0 - (XMM2[31:0] * XMM0[31:0])) +
//                                      //  XMM1[31:0] -> XMM0[31:0]
//
//  XMM2 <- XMM1 XMM0  VFNMADD132SS, // (0 - (XMM0[31:0] * XMM2[31:0])) +
//                                      //  XMM1[31:0] -> XMM2[31:0]
//
//  XMM0 XMM1 XMM8 VFNMADD132SS,        // (0 - (XMM0[31:0] * XMM8[31:0])) +
//                                      //  XMM1[31:0] -> XMM8[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfnmadd213sscomma ( VFNMADD213SS, )
//
// C prototype:
//  void dg_forthvfnmadd213sscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFNMADD213SS instruction. This opcode sequence multiplies the single
//   precision floating point value from the lower 32 bits of the destination 
//   with the corresponding one from target y, then negates the result and adds it
//   to the corresponding value in the source, and then puts the result into the 
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFNMADD213SS,    // (0 - (XMM1[31:0] * XMM0[31:0])) +
//                                      //  [RBX][31:0] -> XMM0[31:0]
//
//  XMM2 XMM1 XMM0  VFNMADD213SS,       // (0 - (XMM1[31:0] * XMM0[31:0])) +
//                                      //  XMM2[31:0] -> XMM0[31:0]
//
//  XMM2 <- XMM1 XMM0  VFNMADD213SS, // (0 - (XMM1[31:0] * XMM2[31:0])) +
//                                      //  XMM0[31:0] -> XMM2[31:0]
//
//  XMM0 XMM1 XMM8 VFNMADD213SS,        // (0 - (XMM1[31:0] * XMM8[31:0])) +
//                                      //  XMM0[31:0] -> XMM8[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfnmadd231sscomma ( VFNMADD231SS, )
//
// C prototype:
//  void dg_forthvfnmadd231sscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFNMADD231SS instruction. This opcode sequence multiplies the single
//   precision floating point value from the lower 32 bits of the source with 
//   the corresponding one from target y, then negates the result and adds it to  
//   the corresponding value in the destination, and then puts the results into 
//   the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFNMADD231SS,    // (0 - (XMM1[31:0] * [RBX][31:0])) +
//                                      //  XMM0[31:0] -> XMM0[31:0]
//
//  XMM2 XMM1 XMM0  VFNMADD231SS,       // (0 - (XMM1[31:0] * XMM2[31:0])) +
//                                      //  XMM0[31:0] -> XMM0[31:0]
//
//  XMM2 <- XMM1 XMM0  VFNMADD231SS, // (0 - (XMM1[31:0] * XMM0[31:0])) +
//                                      //  XMM2[31:0] -> XMM0[31:0]
//
//  XMM0 XMM1 XMM8 VFNMADD231SS,        // (0 - (XMM1[31:0] * XMM0[31:0])) +
//                                      //  XMM8[31:0] -> XMM8[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfnmsub132pdcomma ( VFNMSUB132PD, )
//
// C prototype:
//  void dg_forthvfnmsub132pdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFNMSUB132PD instruction. This opcode sequence multiplies each double
//   precision floating point value from the destination with the corresponding
//   one from the source, then subtracts the corresponding value in target y
//   from the negation of the result, and then puts the results into the 
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFNMSUB132PD,  // (0 - ([RBX][63:0]   * XMM0[63:0])) -
//                                    //  XMM1[63:0]    -> XMM0[63:0]
//                                    // (0 - ([RBX][127:64] * XMM0[127:64])) -
//                                    //  XMM1[127:64]  -> XMM0[127:64]
//
//  XMM2 XMM1 XMM0  VFNMSUB132PD,    // (0 - (XMM2[63:0]    * XMM0[63:0])) -
//                                   //  XMM1[63:0]   -> XMM0[63:0]
//                                   // (0 - (XMM2[127:64]  * XMM0[127:64])) -
//                                   //  XMM1[127:64] -> XMM0[127:64]
//
//  XMM2 <- XMM1 XMM0  VFNMSUB132PD, // (0 - (XMM0[63:0]   * XMM2[63:0])) -
//                                      //  XMM1[63:0]   -> XMM0[63:0]
//                                      // (0 - (XMM0[127:64] * XMM2[127:64])) -
//                                      //  XMM1[127:64] -> XMM0[127:64]
//
//  YMM0 YMM1 YMM8 VFNMSUB132PD,    // (0 - (YMM0[63:0]    * YMM8[63:0])) -
//                                  //  YMM1[63:0]    -> YMM8[63:0]
//                                  // (0 - (YMM0[127:64]  * YMM8[127:64])) -
//                                  //  YMM1[127:64]  -> YMM8[127:64]
//                                  // (0 - (YMM0[191:128] * YMM8[191:128])) -
//                                  //  YMM1[191:128] -> YMM8[191:128]
//                                  // (0 - (YMM0[255:192] * YMM8[255:192])) -
//                                  //  YMM1[255:192] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfnmsub213pdcomma ( VFNMSUB213PD, )
//
// C prototype:
//  void dg_forthvfnmsub213pdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFNMSUB213PD instruction. This opcode sequence multiplies each double
//   precision floating point value from the destination with the corresponding
//   one from target y, then subtracts the corresponding value in the source from
//   the negation of the result, and then puts the results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFNMSUB213PD,  // (0 - (XMM1[63:0]    * XMM0[63:0])) -
//                                    //  [RBX][63:0]   -> XMM0[63:0]
//                                    // (0 - (XMM1[127:64]  * XMM0[127:64])) -
//                                    //  [RBX][127:64] -> XMM0[127:64]
//
//  XMM2 XMM1 XMM0  VFNMSUB213PD,     // (0 - (XMM1[63:0]    * XMM0[63:0])) -
//                                    //  XMM2[63:0]   -> XMM0[63:0]
//                                    // (0 - (XMM1[127:64]  * XMM0[127:64])) -
//                                    //  XMM2[127:64] -> XMM0[127:64]
//
//  XMM2 <- XMM1 XMM0  VFNMSUB213PD, // (0 - (XMM1[63:0]   * XMM2[63:0])) -
//                                      //  XMM0[63:0]   -> XMM0[63:0]
//                                      // (0 - (XMM1[127:64] * XMM2[127:64])) -
//                                      //  XMM0[127:64] -> XMM0[127:64]
//
//  YMM0 YMM1 YMM8 VFNMSUB213PD,    // (0 - (YMM1[63:0]    * YMM8[63:0])) -
//                                  //  YMM0[63:0]    -> YMM8[63:0]
//                                  // (0 - (YMM1[127:64]  * YMM8[127:64])) -
//                                  //  YMM0[127:64]  -> YMM8[127:64]
//                                  // (0 - (YMM1[191:128] * YMM8[191:128])) -
//                                  //  YMM0[191:128] -> YMM8[191:128]
//                                  // (0 - (YMM1[255:192] * YMM8[255:192])) -
//                                  //  YMM0[255:192] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfnmsub231pdcomma ( VFNMSUB231PD, )
//
// C prototype:
//  void dg_forthvfnmsub231pdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFNMSUB231PD instruction. This opcode sequence multiplies each double
//   precision floating point value from the source with the corresponding
//   one from target y, then subtracts the corresponding value in the
//   destination from the negation of the result, and then puts the results into  
//   the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFNMSUB231PD,  // (0 - (XMM1[63:0]   * [RBX][63:0])) -
//                                    //  XMM0[63:0]   -> XMM0[63:0]
//                                    // (0 - (XMM1[127:64] * [RBX][127:64])) -
//                                    //  XMM0[127:64] -> XMM0[127:64]
//
//  XMM2 XMM1 XMM0  VFNMSUB231PD,     // (0 - (XMM1[63:0]    * XMM2[63:0])) -
//                                    //  XMM0[63:0]   -> XMM0[63:0]
//                                    // (0 - (XMM1[127:64]  * XMM2[127:64])) -
//                                    //  XMM0[127:64] -> XMM0[127:64]
//
//  XMM2 <- XMM1 XMM0  VFNMSUB231PD, // (0 - (XMM1[63:0]   * XMM0[63:0])) -
//                                      //  XMM2[63:0]   -> XMM0[63:0]
//                                      // (0 - (XMM1[127:64] * XMM0[127:64])) -
//                                      //  XMM2[127:64] -> XMM0[127:64]
//
//  YMM0 YMM1 YMM8 VFNMSUB231PD,     // (0 - (YMM1[63:0]   * YMM0[63:0])) -
//                                   //  YMM8[63:0]   -> YMM8[63:0]
//                                   // (0 - (YMM1[127:64] * YMM0[127:64])) -
//                                   //  YMM8[127:64] -> YMM8[127:64]
//                                   // (0 - (YMM1[191:128] * YMM0[191:128])) -
//                                   //  YMM8[191:128] -> YMM8[191:128]
//                                   // (0 - (YMM1[255:192] * YMM0[255:192])) -
//                                   //  YMM8[255:192] -> YMM8[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfnmsub132pscomma ( VFNMSUB132PS, )
//
// C prototype:
//  void dg_forthvfnmsub132pscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                           sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFNMSUB132PS instruction. This opcode sequence multiplies each single
//   precision floating point value from the destination with the corresponding
//   one from the source, then subtracts the corresponding value in target y
//   from the negation of the result, and then puts the results into the 
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFNMSUB132PS,  // (0 - ([RBX][31:0] * XMM0[31:0])) -
//                                    //  XMM1[31:0] -> XMM0[31:0]
//                                    // (0 - ([RBX][63:32] * XMM0[63:32])) -
//                                    //  XMM1[63:32] -> XMM0[63:32]
//                                    // (0 - ([RBX][95:64] * XMM0[95:64])) -
//                                    //  XMM1[95:64] -> XMM0[95:64]
//                                    // (0 - ([RBX][127:96] * XMM0[127:96])) -
//                                    //  XMM1[127:96] -> XMM0[127:96]
//
//  XMM2 XMM1 XMM0  VFNMSUB132PS,    // (0 - (XMM2[31:0] * XMM0[31:0])) -
//                                   //  XMM1[31:0] -> XMM0[31:0]
//                                   // (0 - (XMM2[63:32] * XMM0[63:32])) -
//                                   //  XMM1[63:32] -> XMM0[63:32]
//                                   // (0 - (XMM2[95:64] * XMM0[95:64])) -
//                                   //  XMM1[95:64] -> XMM0[95:64]
//                                   // (0 - (XMM2[127:96] * XMM0[127:96])) -
//                                   //  XMM1[127:96] -> XMM0[127:96]
//
//  XMM2 <- XMM1 XMM0  VFNMSUB132PS, // (0 - (XMM0[31:0] * XMM2[31:0])) -
//                                      //  XMM1[31:0] -> XMM0[31:0]
//                                      // (0 - (XMM0[63:32] * XMM2[63:32])) -
//                                      //  XMM1[63:32] -> XMM0[63:32]
//                                      // (0 - (XMM0[95:64] * XMM2[95:64])) -
//                                      //  XMM1[95:64] -> XMM0[95:64]
//                                      // (0 - (XMM0[127:96] * XMM2[127:96])) -
//                                      //  XMM1[127:96] -> XMM0[127:96]
//
//  YMM0 YMM1 YMM8 VFNMSUB132PS,    // (0 - (YMM0[31:0] * YMM8[31:0])) -
//                                  //  YMM1[31:0] -> YMM8[31:0]
//                                  // (0 - (YMM0[63:32] * YMM8[63:32])) -
//                                  //  YMM1[63:32] -> YMM8[63:32]
//                                  // (0 - (YMM0[95:64] * YMM8[95:64])) -
//                                  //  YMM1[95:64] -> YMM8[95:64]
//                                  // (0 - (YMM0[127:96] * YMM8[127:96])) -
//                                  //  YMM1[127:96] -> YMM8[127:96]
//                                  // (0 - (YMM0[159:128] * YMM8[159:128])) -
//                                  //  YMM1[159:128] -> YMM8[159:128]
//                                  // (0 - (YMM0[191:160] * YMM8[191:160])) -
//                                  //  YMM1[191:160] -> YMM8[191:160]
//                                  // (0 - (YMM0[223:192] * YMM8[223:192])) -
//                                  //  YMM1[223:192] -> YMM8[223:192]
//                                  // (0 - (YMM0[255:224] * YMM8[255:224])) -
//                                  //  YMM1[255:224] -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfnmsub213pscomma ( VFNMSUB213PS, )
//
// C prototype:
//  void dg_forthvfnmsub213pscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                           sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFNMSUB213PS instruction. This opcode sequence multiplies each single
//   precision floating point value from the destination with the corresponding
//   one from target y, then subtracts the corresponding value in the source
//   from the negation of the result, and then puts the results into the 
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFNMSUB213PS,  // (0 - (XMM1[31:0] * XMM0[31:0])) -
//                                    //  [RBX][31:0] -> XMM0[31:0]
//                                    // (0 - (XMM1[63:32] * XMM0[63:32])) -
//                                    //  [RBX][63:32] -> XMM0[63:32]
//                                    // (0 - (XMM1[95:64] * XMM0[95:64])) -
//                                    //  [RBX][95:64] -> XMM0[95:64]
//                                    // (0 - (XMM1[127:96] * XMM0[127:96])) -
//                                    //  [RBX][127:96] -> XMM0[127:96]
//
//  XMM2 XMM1 XMM0  VFNMSUB213PS,     // (0 - (XMM1[31:0] * XMM0[31:0])) -
//                                    //  XMM2[31:0] -> XMM0[31:0]
//                                    // (0 - (XMM1[63:32] * XMM0[63:32])) -
//                                    //  XMM2[63:32] -> XMM0[63:32]
//                                    // (0 - (XMM1[95:64] * XMM0[95:64])) -
//                                    //  XMM2[95:64] -> XMM0[95:64]
//                                    // (0 - (XMM1[127:96] * XMM0[127:96])) -
//                                    //  XMM2[127:96] -> XMM0[127:96]
//
//  XMM2 <- XMM1 XMM0  VFNMSUB213PS, // (0 - (XMM1[31:0] * XMM2[31:0])) -
//                                      //  XMM0[31:0] -> XMM0[31:0]
//                                      // (0 - (XMM1[63:32] * XMM2[63:32])) -
//                                      //  XMM0[63:32] -> XMM0[63:32]
//                                      // (0 - (XMM1[95:64] * XMM2[95:64])) -
//                                      //  XMM0[95:64] -> XMM0[95:64]
//                                      // (0 - (XMM1[127:96] * XMM2[127:96])) -
//                                      //  XMM0[127:96] -> XMM0[127:96]
//
//  YMM0 YMM1 YMM8 VFNMSUB213PS,    // (0 - (YMM1[31:0] * YMM8[31:0])) -
//                                  //  YMM0[31:0] -> YMM8[31:0]
//                                  // (0 - (YMM1[63:32] * YMM8[63:32])) -
//                                  //  YMM0[63:32] -> YMM8[63:32]
//                                  // (0 - (YMM1[95:64] * YMM8[95:64])) -
//                                  //  YMM0[95:64] -> YMM8[95:64]
//                                  // (0 - (YMM1[127:96] * YMM8[127:96])) -
//                                  //  YMM0[127:96] -> YMM8[127:96]
//                                  // (0 - (YMM1[159:128] * YMM8[159:128])) -
//                                  //  YMM0[159:128] -> YMM8[159:128]
//                                  // (0 - (YMM1[191:160] * YMM8[191:160])) -
//                                  //  YMM0[191:160] -> YMM8[191:160]
//                                  // (0 - (YMM1[223:192] * YMM8[223:192])) -
//                                  //  YMM0[223:192] -> YMM8[223:192]
//                                  // (0 - (YMM1[255:224] * YMM8[255:224])) -
//                                  //  YMM0[255:224] -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfnmsub231pscomma ( VFNMSUB231PS, )
//
// C prototype:
//  void dg_forthvfnmsub231pscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                           sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFNMSUB231PS instruction. This opcode sequence multiplies each single
//   precision floating point value from the source with the corresponding
//   one from target y, then subtracts the corresponding value in the
//   destination from the negation of the result, and then puts the results into 
//   the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFNMSUB231PS,  // (0 - (XMM1[31:0] * [RBX][31:0])) -
//                                    //  XMM0[31:0] -> XMM0[31:0]
//                                    // (0 - (XMM1[63:32] * [RBX][63:32])) -
//                                    //  XMM0[63:32] -> XMM0[63:32]
//                                    // (0 - (XMM1[95:64] * [RBX][95:64])) -
//                                    //  XMM0[95:64] -> XMM0[95:64]
//                                    // (0 - (XMM1[127:96] * [RBX][127:96])) -
//                                    //  XMM0[127:96] -> XMM0[127:96]
//
//  XMM2 XMM1 XMM0  VFNMSUB231PS,   // (0 - (XMM1[31:0] * XMM2[31:0])) -
//                                  //  XMM0[31:0] -> XMM0[31:0]
//                                  // (0 - (XMM1[63:32] * XMM2[63:32])) -
//                                  //  XMM0[63:32] -> XMM0[63:32]
//                                  // (0 - (XMM1[95:64] * XMM2[95:64])) -
//                                  //  XMM0[95:64] -> XMM0[95:64]
//                                  // (0 - (XMM1[127:96] * XMM2[127:96])) -
//                                  //  XMM0[127:96] -> XMM0[127:96]
//
//  XMM2 <- XMM1 XMM0  VFNMSUB231PS, // (0 - (XMM1[31:0] * XMM0[31:0])) -
//                                      //  XMM2[31:0] -> XMM0[31:0]
//                                      // (0 - (XMM1[63:32] * XMM0[63:32])) -
//                                      //  XMM2[63:32] -> XMM0[63:32]
//                                      // (0 - (XMM1[95:64] * XMM0[95:64])) -
//                                      //  XMM2[95:64] -> XMM0[95:64]
//                                      // (0 - (XMM1[127:96] * XMM0[127:96])) -
//                                      //  XMM2[127:96] -> XMM0[127:96]
//
//  YMM0 YMM1 YMM8 VFNMSUB231PS,    // (0 - (YMM1[31:0] * YMM0[31:0])) -
//                                  //  YMM8[31:0] -> YMM8[31:0]
//                                  // (0 - (YMM1[63:32] * YMM0[63:32])) -
//                                  //  YMM8[63:32] -> YMM8[63:32]
//                                  // (0 - (YMM1[95:64] * YMM0[95:64])) -
//                                  //  YMM8[95:64] -> YMM8[95:64]
//                                  // (0 - (YMM1[127:96] * YMM0[127:96])) -
//                                  //  YMM8[127:96] -> YMM8[127:96]
//                                  // (0 - (YMM1[159:128] * YMM0[159:128])) -
//                                  //  YMM8[159:128] -> YMM8[159:128]
//                                  // (0 - (YMM1[191:160] * YMM0[191:160])) -
//                                  //  YMM8[191:160] -> YMM8[191:160]
//                                  // (0 - (YMM1[223:192] * YMM0[223:192])) -
//                                  //  YMM8[223:192] -> YMM8[223:192]
//                                  // (0 - (YMM1[255:224] * YMM0[255:224])) -
//                                  //  YMM8[255:224] -> YMM8[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfnmsub132sdcomma ( VFNMSUB132SD, )
//
// C prototype:
//  void dg_forthvfnmsub132sdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                           sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFNMSUB132SD instruction. This opcode sequence multiplies the double
//   precision floating point value from the lower 64 bits of the destination 
//   with the corresponding one from the source, then subtracts the 
//   corresponding value in target y from the negation of the result, and then  
//   puts the result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFNMSUB132SD,  // (0 - ([RBX][63:0]   * XMM0[63:0])) -
//                                    //  XMM1[63:0]    -> XMM0[63:0]
//
//  XMM2 XMM1 XMM0  VFNMSUB132SD,     // (0 - (XMM2[63:0]    * XMM0[63:0])) -
//                                    //  XMM1[63:0]   -> XMM0[63:0]
//
//  XMM2 <- XMM1 XMM0  VFNMSUB132SD, // (0 - (XMM0[63:0]   * XMM2[63:0])) -
//                                      //  XMM1[63:0]   -> XMM0[63:0]
//
//  XMM0 XMM1 XMM8 VFNMSUB132SD,    // (0 - (XMM0[63:0]    * XMM8[63:0])) -
//                                  //  XMM1[63:0]    -> XMM8[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfnmsub213sdcomma ( VFNMSUB213SD, )
//
// C prototype:
//  void dg_forthvfnmsub213sdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFNMSUB213SD instruction. This opcode sequence multiplies the double
//   precision floating point value from the lower 64 bits of the destination with 
//   the corresponding one from target y, then subtracts the corresponding 
//   value in the source from the negation of the result, and then puts the result 
//   into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFNMSUB213SD,  // (0 - (XMM1[63:0]    * XMM0[63:0])) -
//                                    //  [RBX][63:0]   -> XMM0[63:0]
//
//  XMM2 XMM1 XMM0  VFNMSUB213SD,     // (0 - (XMM1[63:0]    * XMM0[63:0])) -
//                                    //  XMM2[63:0]   -> XMM0[63:0]
//
//  XMM2 <- XMM1 XMM0  VFNMSUB213SD, // (0 - (XMM1[63:0]   * XMM2[63:0])) -
//                                      //  XMM0[63:0]   -> XMM0[63:0]
//
//  XMM0 XMM1 XMM8 VFNMSUB213SD,    // (0 - (XMM1[63:0]    * XMM8[63:0])) -
//                                  //  XMM0[63:0]    -> XMM8[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfnmsub231sdcomma ( VFNMSUB231SD, )
//
// C prototype:
//  void dg_forthvfnmsub231sdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFNMSUB231SD instruction. This opcode sequence multiplies the double
//   precision floating point value from the lower 64 bits of the source with 
//   the corresponding one from target y, then subtracts the corresponding
//   value in the destination from the negation of the result, and then puts the 
//   result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFNMSUB231SD,  // (0 - (XMM1[63:0]   * [RBX][63:0])) -
//                                    //  XMM0[63:0]   -> XMM0[63:0]
//
//  XMM2 XMM1 XMM0  VFNMSUB231SD,     // (0 - (XMM1[63:0]    * XMM2[63:0])) -
//                                    //  XMM0[63:0]   -> XMM0[63:0]
//
//  XMM2 <- XMM1 XMM0  VFNMSUB231SD, // (0 - (XMM1[63:0]   * XMM0[63:0])) -
//                                      //  XMM2[63:0]   -> XMM0[63:0]
//
//  XMM0 XMM1 XMM8 VFNMSUB231SD,    // (0 - (XMM1[63:0]   * XMM0[63:0])) -
//                                  //  XMM8[63:0]   -> XMM8[63:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfnmsub132sscomma ( VFNMSUB132SS, )
//
// C prototype:
//  void dg_forthvfnmsub132sscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFNMSUB132SS instruction. This opcode sequence multiplies the single
//   precision floating point value from the lower 32 bits of the destination 
//   with the corresponding one from the source, then subtracts the  
//   corresponding value in target y from the negation of the result, and then 
//   puts the result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFNMSUB132SS,  // (0 - ([RBX][31:0] * XMM0[31:0])) -
//                                    //  XMM1[31:0] -> XMM0[31:0]
//
//  XMM2 XMM1 XMM0  VFNMSUB132SS,   // (0 - (XMM2[31:0] * XMM0[31:0])) -
//                                  //  XMM1[31:0] -> XMM0[31:0]
//
//  XMM2 <- XMM1 XMM0  VFNMSUB132SS, // (0 - (XMM0[31:0] * XMM2[31:0])) -
//                                      //  XMM1[31:0] -> XMM0[31:0]
//
//  XMM0 XMM1 XMM8 VFNMSUB132SS,    // (0 - (XMM0[31:0] * XMM8[31:0])) -
//                                  //  XMM1[31:0] -> XMM8[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfnmsub213sscomma ( VFNMSUB213SS, )
//
// C prototype:
//  void dg_forthvfnmsub213sscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFNMSUB213SS instruction. This opcode sequence multiplies the single
//   precision floating point value from the lower 32 bits of the destination 
//   with the corresponding one from target y, then subtracts the corresponding
//   value in the source from the negation of the result, and then puts the  
//   result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFNMSUB213SS,  // (0 - (XMM1[31:0] * XMM0[31:0])) -
//                                    //  [RBX][31:0] -> XMM0[31:0]
//
//  XMM2 XMM1 XMM0  VFNMSUB213SS,   // (0 - (XMM1[31:0] * XMM0[31:0])) -
//                                  //  XMM2[31:0] -> XMM0[31:0]
//
//  XMM2 <- XMM1 XMM0  VFNMSUB213SS, // (0 - (XMM1[31:0] * XMM2[31:0])) -
//                                      //  XMM0[31:0] -> XMM0[31:0]
//
//  XMM0 XMM1 XMM8 VFNMSUB213SS,     // (0 - (XMM1[31:0] * XMM8[31:0])) -
//                                   //  XMM0[31:0] -> XMM8[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvfnmsub231sscomma ( VFNMSUB231SS, )
//
// C prototype:
//  void dg_forthvfnmsub231sscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain this addressing mode
//   specifier:
//
//   targetxmmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VFNMSUB231SS instruction. This opcode sequence multiplies the single
//   precision floating point value from the lower 32 bits of the source with 
//   the corresponding one from target y, then subtracts the corresponding 
//   value in the destination from the negation of the result, and then puts   
//   the result into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R] XMM1 XMM0  VFNMSUB231SS,  // (0 - (XMM1[31:0] * [RBX][31:0])) -
//                                    //  XMM0[31:0] -> XMM0[31:0]
//
//  XMM2 XMM1 XMM0  VFNMSUB231SS,    // (0 - (XMM1[31:0] * XMM2[31:0])) -
//                                   //  XMM0[31:0] -> XMM0[31:0]
//
//  XMM2 <- XMM1 XMM0  VFNMSUB231SS, // (0 - (XMM1[31:0] * XMM0[31:0])) -
//                                      //  XMM2[31:0] -> XMM0[31:0]
//
//  XMM0 XMM1 XMM8 VFNMSUB231SS,    // (0 - (XMM1[31:0] * XMM0[31:0])) -
//                                  //  XMM8[31:0] -> XMM8[31:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvperm2f128comma ( VPERM2F128, )
//
// C prototype:
//  void dg_forthvperm2f128comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
// 
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist 
//     targetzparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for the target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
// 
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls four targets from the data stack and compiles the opcode sequence for
//   an x86 VPERM2F128 instruction. This opcode sequence uses the value of
//   target w to determine the sources for each 128 bit section of the
//   destination.
//
//  target w bits 3, 1, 0    source for dest[127:0]
//                0  0  0     target y[127:0]
//                0  0  1     target y[255:128]         
//                0  1  0     source[127:0]
//                0  1  1     source[255:128]
//                1  x  x     0                // destination[127:0] is cleared  
//
//  target w bits 7, 5, 4    source for dest[255:128]
//                0  0  0     target y[127:0]
//                0  0  1     target y[255:128]         
//                0  1  0     source[127:0]
//                0  1  1     source[255:128]
//                1  x  x     0                // destination[255:128] is cleared  
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  10 N  RBX [R]  YMM1  YMM0  VPERM2F128,  // YMM1 -> YMM0
//
//  32 N  RBX [R]  YMM1  YMM0  VPERM2F128,  // [RBX][255:0] -> YMM0[255:0]
//
//  01 N  YMM2  YMM1  YMM0  VPERM2F128,     // YMM1[255:128]  -> YMM0[127:0]
//                                          // YMM1[127:0]    -> YMM0[255:128]
//
//  23 N  YMM2  YMM1  YMM0  VPERM2F128,     // YMM2[255:128]  -> YMM0[127:0]
//                                          // YMM2[127:0]    -> YMM0[255:128]
//
//  88 N  YMM2  YMM1  YMM0  VPERM2F128,     // 0  -> YMM0[255:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y
//   must be a ymm register. The first target must be the immediate target. 
//   If you use -> it must come after a memory, or ymm register target.
//   The immediate target's  size is one byte, so if you use IMMEDIATE to specify 
//   a minimum size, it must be 1 or 0.
//  Intel docs say this opcode sequence operates on 128 bit floating point values
//   but it probably can be any kind of value. The 0 stored when bits 3 or 7 are
//   set is an integer 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvperm2i128comma ( VPERM2I128, )
//
// C prototype:
//  void dg_forthvperm2i128comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
// 
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist 
//     targetzparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for the target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls four targets from the data stack and compiles the opcode sequence for
//   an x86 VPERM2I128 instruction. This opcode sequence uses the value of
//   target w to determine the sources for each 128 bit section of the
//   destination.
//
//  target w bits 3, 1, 0    source for dest[127:0]
//                0  0  0     target y[127:0]
//                0  0  1     target y[255:128]         
//                0  1  0     source[127:0]
//                0  1  1     source[255:128]
//                1  x  x     0                // destination[127:0] is cleared  
//
//  target w bits 7, 5, 4    source for dest[255:128]
//                0  0  0     target y[127:0]
//                0  0  1     target y[255:128]         
//                0  1  0     source[127:0]
//                0  1  1     source[255:128]
//                1  x  x     0                // destination[255:128] is cleared  
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  10 N  RBX [R]  YMM1  YMM0  VPERM2I128,  // YMM1 -> YMM0
//
//  32 N  RBX [R]  YMM1  YMM0  VPERM2I128,  // [RBX][255:0] -> YMM0[255:0]
//
//  01 N  YMM2  YMM1  YMM0  VPERM2I128,     // YMM1[255:128]  -> YMM0[127:0]
//                                          // YMM1[127:0]    -> YMM0[255:128]
//
//  23 N  YMM2  YMM1  YMM0  VPERM2I128,     // YMM2[255:128]  -> YMM0[127:0]
//                                          // YMM2[127:0]    -> YMM0[255:128]
//
//  88 N  YMM2  YMM1  YMM0  VPERM2I128,     // 0  -> YMM0[255:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y
//   must be a ymm register. The first target must be the immediate target. 
//   If you use -> it must come after a memory, or ymm register target.
//   The immediate target's  size is one byte, so if you use IMMEDIATE to specify 
//   a minimum size, it must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpermpdcomma ( VPERMPD, )
//
// C prototype:
//  void dg_forthvpermpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
// 
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for the target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPERMPD instruction. This opcode sequence uses the value of
//   each 2 bit section of target w to determine the sources for each 64 bit 
//   section of the destination. The lowest 2 bits of target w determine the
//   64 bit section of the source for the lowest 64 bits of the destination. 
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  E4 N  RBX [R]  YMM0  VPERMPD,  // [RBX] -> YMM0
//
//  00 N  RBX [R]  YMM0  VPERMPD,  //  [RBX][63:0] -> YMM0[63:0]
//                                 //  [RBX][63:0] -> YMM0[127:64]
//                                 //  [RBX][63:0] -> YMM0[191:128]
//                                 //  [RBX][63:0] -> YMM0[255:192]
//
//  A5 N  YMM2  YMM0  VPERMPD,     //  YMM2[127:64]  -> YMM0[63:0]
//                                 //  YMM2[127:64]  -> YMM0[127:64]
//                                 //  YMM2[191:128] -> YMM0[191:128]
//                                 //  YMM2[191:128] -> YMM0[255:192]
//
//  1B N  YMM0 <- YMM2  VPERMPD,  //  YMM2[255:192]  -> YMM0[63:0]
//                                   //  YMM2[191:128]  -> YMM0[127:64]
//                                   //  YMM2[127:64]   -> YMM0[191:128]
//                                   //  YMM2[63:0]     -> YMM0[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target 
//   must be a ymm register. The first target must be the immediate target. 
//   If you use -> it must come after a memory, or ymm register target.
//   The immediate target's  size is one byte, so if you use IMMEDIATE to specify 
//   a minimum size, it must be 1 or 0.
//  Intel docs say this opcode sequence works on 64 bit floating point values,
//   but any 64 bit values should work.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpermqcomma ( VPERMQ, )
//
// C prototype:
//  void dg_forthvpermqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
// 
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for the target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPERMQ instruction. This opcode sequence uses the value of
//   each 2 bit section of target w to determine the sources for each 64 bit 
//   section of the destination. The lowest 2 bits of target w determine the
//   64 bit section of the source for the lowest 64 bits of the destination. 
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  E4 N  RBX [R]  YMM0  VPERMQ,   // [RBX] -> YMM0
//
//  00 N  RBX [R]  YMM0  VPERMQ,   //  [RBX][63:0] -> YMM0[63:0]
//                                 //  [RBX][63:0] -> YMM0[127:64]
//                                 //  [RBX][63:0] -> YMM0[191:128]
//                                 //  [RBX][63:0] -> YMM0[255:192]
//
//  A5 N  YMM2  YMM0  VPERMQ,      //  YMM2[127:64]  -> YMM0[63:0]
//                                 //  YMM2[127:64]  -> YMM0[127:64]
//                                 //  YMM2[191:128] -> YMM0[191:128]
//                                 //  YMM2[191:128] -> YMM0[255:192]
//
//  1B N  YMM0 <- YMM2  VPERMQ,   //  YMM2[255:192]  -> YMM0[63:0]
//                                   //  YMM2[191:128]  -> YMM0[127:64]
//                                   //  YMM2[127:64]   -> YMM0[191:128]
//                                   //  YMM2[63:0]     -> YMM0[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target 
//   must be a ymm register. The first target must be the immediate target. 
//   If you use -> it must come after a memory, or ymm register target.
//   The immediate target's  size is one byte, so if you use IMMEDIATE to specify 
//   a minimum size, it must be 1 or 0.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpermdcomma ( VPERMD, )
//
// C prototype:
//  void dg_forthvpermdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
// 
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for target y can contain this addressing mode
//   specifier:
//
//   targetymmregister
//
//  The parameter list for targets x and z can contain these addressing mode
//   specifiers:
//
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   YMMR                         specifies a ymm register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPERMD instruction. This opcode sequence uses the value of
//   the 0 based indexes in each 32 bit section of target y to determine which 32 
//   bit section of the source to copy to each 32 bit section of the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  If the 32 bit indexes in YMM1, highest to lowest, are 7 6 5 4 3 2 1 0 then
//   RBX [R]  YMM1  YMM0  VPERMD,           // [RBX] -> YMM0
//
//  If YMM1 is 0 then
//   RBX [R]  YMM1  YMM0  VPERMD,           // [RBX][31:0] -> YMM0[31:0]
//                                          // [RBX][31:0] -> YMM0[63:32]
//                                          // [RBX][31:0] -> YMM0[95:64]
//                                          // [RBX][31:0] -> YMM0[127:96]
//                                          // [RBX][31:0] -> YMM0[159:128]
//                                          // [RBX][31:0] -> YMM0[191:160]
//                                          // [RBX][31:0] -> YMM0[223:192]
//                                          // [RBX][31:0] -> YMM0[255:224]
//
//  If the 32 bit indexes in YMM1, highest to lowest, are 1 1 2 3 4 5 6 7 then
//   YMM2  YMM1  YMM0  VPERMD,              // YMM2[255:224] -> YMM0[31:0]
//                                          // YMM2[223:192] -> YMM0[63:32]
//                                          // YMM2[191:160] -> YMM0[95:64]
//                                          // YMM2[159:128] -> YMM0[127:96]
//                                          // YMM2[127:96]  -> YMM0[159:128]
//                                          // YMM2[95:64]   -> YMM0[191:160]
//                                          // YMM2[63:32]   -> YMM0[223:192]
//                                          // YMM2[63:32]   -> YMM0[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y
//   must be ymm registers. This instruction does not take an immediate target.
//   If you use -> it must come after a memory, or ymm register target.
//  It looks like only the lowest 3 bits of each index are used and the rest of
//   the bits are ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpermpscomma ( VPERMPS, )
//
// C prototype:
//  void dg_forthvpermpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
// 
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//  The parameter list for target y can contain this addressing mode
//   specifier:
//
//   targetymmregister
//
//  The parameter list for targets x and z can contain these addressing mode
//   specifiers:
//
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   YMMR                         specifies a ymm register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPERMPS instruction. This opcode sequence uses the value of
//   the 0 based indexes in each 32 bit section of target y to determine which 32 
//   bit section of the source to copy to each 32 bit section of the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  If the 32 bit indexes in YMM1, highest to lowest, are 7 6 5 4 3 2 1 0 then
//   RBX [R]  YMM1  YMM0  VPERMPS,          // [RBX] -> YMM0
//
//  If YMM1 is 0 then
//   RBX [R]  YMM1  YMM0  VPERMPS,          // [RBX][31:0] -> YMM0[31:0]
//                                          // [RBX][31:0] -> YMM0[63:32]
//                                          // [RBX][31:0] -> YMM0[95:64]
//                                          // [RBX][31:0] -> YMM0[127:96]
//                                          // [RBX][31:0] -> YMM0[159:128]
//                                          // [RBX][31:0] -> YMM0[191:160]
//                                          // [RBX][31:0] -> YMM0[223:192]
//                                          // [RBX][31:0] -> YMM0[255:224]
//
//  If the 32 bit indexes in YMM1, highest to lowest, are 1 1 2 3 4 5 6 7 then
//   YMM2  YMM1  YMM0  VPERMPS,             // YMM2[255:224] -> YMM0[31:0]
//                                          // YMM2[223:192] -> YMM0[63:32]
//                                          // YMM2[191:160] -> YMM0[95:64]
//                                          // YMM2[159:128] -> YMM0[127:96]
//                                          // YMM2[127:96]  -> YMM0[159:128]
//                                          // YMM2[95:64]   -> YMM0[191:160]
//                                          // YMM2[63:32]   -> YMM0[223:192]
//                                          // YMM2[63:32]   -> YMM0[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y
//   must be ymm registers. This instruction does not take an immediate target.
//   If you use -> it must come after a memory, or ymm register target.
//  It looks like only the lowest 3 bits of each index are used and the rest of
//   the bits are ignored.
//  Intel docs say this instruction works on 32 bit floating point values, but
//   any 32 bit value should work.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpermilpdcomma ( VPERMILPD, )
//
// C prototype:
//  void dg_forthvpermilpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
// 
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for the target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  The parameter list for target x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
// 
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPERMILPD instruction. There are two modes for this instruction, one
//   where the indexes are given in an immediate value, and one where the indexes
//   are given in a memory target, xmm register, or ymm register.
//  If the indexes are given in an immediate value, the immediate target must be
//   the first target, and the other two targets are the source and destination.
//   Each bit of the immediate value selects one of two 64 bit sections of the 
//   source to copy to a 64 bit section of the destination. The lower two
//   indexing bits choose from the lower two 64 bit sections of the source. The
//   upper two indexing bits choose from the upper two 64 bit sections of the
//   source. The lowest bit of the immediate value chooses the source for the 
//   lowest 64 bit section of the destination. The source can be a memory target, 
//   xmm register, or ymm register. The destination must be an xmm or ymm 
//   register.
//  If the indexes are not given in an immediate value, the target holding the 
//   indexes must be the first target if not using reverse, or the last target
//   if using reverse, and can be a memory target, xmm register
//   or ymm register. The other two targets are the source and destination and
//   must be xmm or ymm registers. The indexing for this mode is a little weird.
//   The second lowest bit of each 64 bit section of the index target chooses
//   between two sources. For the lowest 128 bits of the index target, the
//   choosing bits choose between the two 64 bit sections in the lowest 128 bits
//   of the source. For the upper 128 bits of the index target, the choosing
//   bits choose between the two 64 bit sections of the upper 128 bits of the
//   source. So if bit 1 of the index target is 0, it chooses the lowest 64 bits
//   of the source to put into the lowest 64 bits of the destination.
//   
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  02 N  RAX [R]  XMM1  VPERMILPD,    // [RAX] ->  XMM1
//
//  01 N  RAX [R]  XMM1  VPERMILPD,    // [RAX][127:64] -> XMM1[63:0]
//                                     // [RAX][63:0]   -> XMM1[127:64]
//
//  0A N  YMM0  YMM1  VPERMILPD,       // YMM0 -> YMM1
//
//  05 N  YMM0  YMM1  VPERMILPD,       // YMM0[127:64]  -> YMM1[63:0]
//                                     // YMM0[63:0]    -> YMM1[127:64]
//                                     // YMM0[255:192] -> YMM1[191:128]
//                                     // YMM0[191:128] -> YMM1[255:192]
//
//  If [RAX][127:64] = 2 and [RAX][63:0] = 0
//   RAX [R]  XMM0  XMM1  VPERMILPD,   // XMM0 ->  XMM1
//
//  If [RAX][127:64] = 0 and [RAX][63:0] = 2
//   RAX [R]  XMM0  XMM1  VPERMILPD,   // XMM0[127:64] -> XMM1[63:0]
//                                     // XMM0[63:0]   -> XMM1[127:64]
//
//  If YMM2[255:192] = 0, YMM2[191:128] = 2, YMM2[127:64] = 0, YMM2[63:0] = 2
//  YMM2  YMM0  YMM1  VPERMILPD,       // YMM0[127:64]  -> YMM1[63:0]
//                                     // YMM0[63:0]    -> YMM1[127:64]
//                                     // YMM0[255:192] -> YMM1[191:128]
//                                     // YMM0[191:128] -> YMM1[255:192]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register. The first target can be an immediate target, memory target,
//   xmm register, or ymm register.
//  If the first target is an immediate target, the other two targets are
//   the source and destination targets. Using reverse will make the second
//   target the destination and the third target the source. The source can
//   be a memory target, xmm register, or ymm register.
//  If the first target is not an immediate target, then the first target holds
//   the indexes, and the other two targets are the source and destination. Using
//   reverse will make the first target the destination and the third target the
//   one that holds the indexes. The target that holds the indexes can be a
//   memory target, an xmm register, or ymm register. The source must be an xmm
//   register or ymm register.
//  The destination target must be an xmm or ymm register.
//  If you use -> it must come after a memory, xmm, or ymm register target.
//  The immediate target's  size is one byte, so if you use IMMEDIATE to specify 
//   a minimum size, it must be 1 or 0.
//  Intel docs say this opcode sequence operates on 64 bit floating point values
//   but it can be any kind of value. 
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpermilpscomma ( VPERMILPS, )
//
// C prototype:
//  void dg_forthvpermilpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
// 
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for the target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  The parameter list for target x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
// 
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmm register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymm register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPERMILPS instruction. There are two modes for this instruction, one
//   where the indexes are given in an immediate value, and one where the indexes
//   are given in a memory target, xmm register, or ymm register.
//  If the indexes are given in an immediate value, the immediate target must be
//   the first target, and the other two targets are the source and destination.
//   Every two bits of the immediate value selects one of four 32 bit sections 
//   of the source to copy to a 32 bit section of the destination for both the
//   lower 128 bits of the source and destination, and the upper 128 bits of the
//   source and destination. The lowest two bits of the immediate value chooses 
//   the source for the lowest 32 bit section of the destination and the lowest
//   32 bit section of the upper 128 bits of the destination. The source can be 
//   a memory target, xmm register, or ymm register. The destination must be an 
//   xmm or ymm register.
//  If the indexes are not given in an immediate value, the target holding the 
//   indexes must be the first target if not using reverse, or the last target
//   if using reverse, and can be a memory target, xmm register
//   or ymm register. The other two targets are the source and destination and
//   must be xmm or ymm registers. The indexing for this mode is a little weird.
//   The lower two bits of each 32 bit section of the index target chooses
//   between four sources. For the lowest 128 bits of the index target, the
//   choosing bits choose between the four 32 bit sections in the lowest 128 bits
//   of the source. For the upper 128 bits of the index target, the choosing
//   bits choose between the four 32 bit sections of the upper 128 bits of the
//   source. So if bits 0 and 1 of the index target are 00, they choose the lowest
//   32 bits of the source to put into the lowest 32 bits of the destination.
//   
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  E4 N  RAX [R]  XMM1  VPERMILPS,    // [RAX] ->  XMM1
//
//  1B N  RAX [R]  XMM1  VPERMILPS,    // [RAX][127:96] -> XMM1[31:0]
//                                     // [RAX][95:64]  -> XMM1[63:32]
//                                     // [RAX][63:32]  -> XMM1[95:64]
//                                     // [RAX][31:0]   -> XMM1[127:96]
//
//  E4 N  YMM0  YMM1  VPERMILPS,       // YMM0 -> YMM1
//
//  1B N  YMM0  YMM1  VPERMILPS,       // YMM0[127:96]  -> YMM1[31:0]
//                                     // YMM0[95:64]   -> YMM1[63:32]
//                                     // YMM0[63:32]   -> YMM1[95:64]
//                                     // YMM0[31:0]    -> YMM1[127:96]
//                                     // YMM0[255:224] -> YMM1[159:128]
//                                     // YMM0[223:192] -> YMM1[191:160]
//                                     // YMM0[191:160] -> YMM1[223:192]
//                                     // YMM0[159:128] -> YMM1[255:224]
//
//  If [RAX] = 00000003000000020000000100000000
//   RAX [R]  XMM0  XMM1  VPERMILPS,   // XMM0 ->  XMM1
//
//  If [RAX] = 00000000000000010000000200000003
//   RAX [R]  XMM0  XMM1  VPERMILPS,   // XMM0[127:96] -> XMM1[31:0]
//                                     // XMM0[95:64]  -> XMM1[63:32]
//                                     // XMM0[63:32]  -> XMM1[95:64]
//                                     // XMM0[31:0]   -> XMM1[127:96]
//
//  If YMM2 = 0000000000000001000000020000000300000000000000010000000200000003
//  YMM2  YMM0  YMM1  VPERMILPS,       // YMM0[127:96]  -> YMM1[31:0]
//                                     // YMM0[95:64]   -> YMM1[63:32]
//                                     // YMM0[63:32]   -> YMM1[95:64]
//                                     // YMM0[31:0]    -> YMM1[127:96]
//                                     // YMM0[255:224] -> YMM1[159:128]
//                                     // YMM0[223:192] -> YMM1[191:160]
//                                     // YMM0[191:160] -> YMM1[223:192]
//                                     // YMM0[159:128] -> YMM1[255:224]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register. The first target can be an immediate target, memory target,
//   xmm register, or ymm register.
//  If the first target is an immediate target, the other two targets are
//   the source and destination targets. Using reverse will make the second
//   target the destination and the third target the source. The source can
//   be a memory target, xmm register, or ymm register.
//  If the first target is not an immediate target, then the first target holds
//   the indexes, and the other two targets are the source and destination. Using
//   reverse will make the first target the destination and the third target the
//   one that holds the indexes. The register that holds the indexes can be a
//   memory target, an xmm register, or ymm register. The source must be an xmm
//   register or ymm register.
//  The destination target must be an xmm or ymm register.
//  If you use -> it must come after a memory, xmm, or ymm register target.
//  The immediate target's  size is one byte, so if you use IMMEDIATE to specify 
//   a minimum size, it must be 1 or 0.
//  Intel docs say this opcode sequence operates on 32 bit floating point values
//   but it can be any kind of value.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpslldcomma ( VPSLLD, )
//
// C prototype:
//  void dg_forthvpslldcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetcountparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetcountparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for targetcountparameterlist can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//   targetxmmregister
//   targetxmmregister XMMR
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  The parameter lists for targetxparameterlist or targetyparameterlist contain
//   these addressing mode specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//   targetymmregister
//   targetymmregister YMMR
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0, 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding. If this is not
//                                 used then 2 byte vex encoding is used if
//                                 possible.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSLLD instruction. This opcode sequence logical shifts each 32 bit
//   value in the source xmm register left the number of bits specified in the
//   count value and puts the result into the destination. If the count value
//   is greater than 31, the destination is cleared. The count can come from an
//   8 bit immediate value, an xmm register, or memory.
//   If the count is from memory, a 128 bit value is read from memory.
//   Zeros are shifted in from the right.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  XMM1  XMM2  VPSLLD,     //  nothing happens
//
//  1 N  XMM1  XMM2  VPSLLD,     // XMM1[31:0]   << 1 -> XMM2[31:0]
//                               // XMM1[63:32]  << 1 -> XMM2[63:32]
//                               // XMM1[95:64]  << 1 -> XMM2[95:64]
//                               // XMM1[127:96] << 1 -> XMM2[127:96]
//
//  2 N  XMM1  XMM2  VPSLLD,     // XMM1[31:0]   << 2 -> XMM2[31:0]
//                               // XMM1[63:32]  << 2 -> XMM2[63:32]
//                               // XMM1[95:64]  << 2 -> XMM2[95:64]
//                               // XMM1[127:96] << 2 -> XMM2[127:96]
//
//  32 N  XMM1  XMM2  VPSLLD,    // 0 -> XMM2[127:0]
//
//  XMM0  XMM1  XMM2  VPSLLD,    // XMM1[31:0]   << XMM0 -> XMM2[31:0]
//                               // XMM1[63:32]  << XMM0 -> XMM2[63:32]
//                               // XMM1[95:64]  << XMM0 -> XMM2[95:64]
//                               // XMM1[127:96] << XMM0 -> XMM2[127:96]
//
//  RAX [R]  XMM1  XMM2  VPSLLD, // XMM1[31:0]   << [RAX][63:0]
//                               //  -> XMM2[31:0]
//                               // XMM1[63:32]  << [RAX][63:0]
//                               //  -> XMM2[63:32]
//                               // XMM1[95:64]  << [RAX][63:0]
//                               //  -> XMM2[95:64]
//                               // XMM1[127:96] << [RAX][63:0]
//                               //  -> XMM2[127:96]
//
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpsllqcomma ( VPSLLQ, )
//
// C prototype:
//  void dg_forthvpsllqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetcountparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetcountparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for targetcountparameterlist can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//   targetxmmregister
//   targetxmmregister XMMR
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  The parameter lists for targetxparameterlist or targetyparameterlist contain
//   these addressing mode specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//   targetymmregister
//   targetymmregister YMMR
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0, 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding. If this is not
//                                 used then 2 byte vex encoding is used if
//                                 possible.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSLLQ instruction. This opcode sequence logical shifts each 64 bit
//   value in the source xmm or ymm register left the number of bits specified
//   in the count value. Then puts the result into the destination xmm or ymm
//   register. If the count value is greater than 63, the destination is
//   cleared. The count can come from an xmm register, an 8 bit immediate
//   value, or memory. If the count is from memory, a 128 bit value is read
//   from memory.
//   Zeros are shifted in from the right.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  XMM1  XMM2  VPSLLQ,     //  nothing happens
//
//  1 N  XMM1  XMM2  VPSLLQ,     // XMM1[63:0]    << 1 -> XMM2[63:0]
//                               // XMM1[127:64]  << 1 -> XMM2[127:64]
//
//  2 N  XMM1  XMM2  VPSLLQ,     // XMM1[63:0]    << 2 -> XMM2[63:0]
//                               // XMM1[127:64]  << 2 -> XMM2[127:64]
//
//  64 N  XMM1  XMM2  VPSLLQ,    // 0 -> XMM2[127:0]
//
//  XMM0  XMM1  XMM2  VPSLLQ,    // XMM1[63:0]   << XMM0 -> XMM2[63:0]
//                               // XMM1[127:64] << XMM0 -> XMM2[127:64]
//
//  RAX [R]  XMM1  XMM2  VPSLLQ, // XMM1[63:0]   << [RAX][63:0]
//                               //  -> XMM2[63:0]
//                               // XMM1[127:64] << [RAX][63:0]
//                               //  -> XMM2[127:64]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpsllwcomma ( VPSLLW, )
//
// C prototype:
//  void dg_forthvpsllwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetcountparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetcountparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for targetcountparameterlist can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//   targetxmmregister
//   targetxmmregister XMMR
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  The parameter lists for targetxparameterlist or targetyparameterlist contain
//   these addressing mode specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//   targetymmregister
//   targetymmregister YMMR
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0, 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding. If this is not
//                                 used then 2 byte vex encoding is used if
//                                 possible.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSLLW instruction. This opcode sequence logical shifts each 16 bit
//   value in the source xmm or ymm register left the number of bits specified
//   in the count value. Then puts the result into the destination.
//   Zeros are shifted in from the right. If the count value is greater than 15,
//   the destination is cleared. The count can come from an xmm register, an
//   8 bit immediate value, or memory. If the count is from memory, a 128 bit
//   value is read from memory.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  XMM1  XMM2  VPSLLW,    //  nothing happens
//
//  1 N  XMM1  XMM2  VPSLLW,    // XMM1[15:0]    << 1 -> XMM2[15:0]
//                              // XMM1[31:16]   << 1 -> XMM2[31:16]
//                              // XMM1[47:32]   << 1 -> XMM2[47:32]
//                              // XMM1[63:48]   << 1 -> XMM2[63:48]
//                              // XMM1[79:64]   << 1 -> XMM2[79:64]
//                              // XMM1[95:80]   << 1 -> XMM2[95:80]
//                              // XMM1[111:96]  << 1 -> XMM2[111:96]
//                              // XMM1[127:112] << 1 -> XMM2[127:112]
//
//  2 N  XMM1  XMM2  VPSLLW,    // XMM1[15:0]    << 2 -> XMM2[15:0]
//                              // XMM1[31:16]   << 2 -> XMM2[31:16]
//                              // XMM1[47:32]   << 2 -> XMM2[47:32]
//                              // XMM1[63:48]   << 2 -> XMM2[63:48]
//                              // XMM1[79:64]   << 2 -> XMM2[79:64]
//                              // XMM1[95:80]   << 2 -> XMM2[95:80]
//                              // XMM1[111:96]  << 2 -> XMM2[111:96]
//                              // XMM1[127:112] << 2 -> XMM2[127:112]
//
//  16 N  XMM1  XMM2  VPSLLW,   // 0 -> XMM2[127:0]
//
//  XMM0  XMM1  XMM2  VPSLLW,   // XMM1[15:0]    << XMM0
//                              //  -> XMM2[15:0]
//                              // XMM1[31:16]   << XMM0
//                              //  -> XMM2[31:16]
//                              // XMM1[47:32]   << XMM0
//                              //  -> XMM2[47:32]
//                              // XMM1[63:48]   << XMM0
//                              //  -> XMM2[63:48]
//                              // XMM1[79:64]   << XMM0
//                              //  -> XMM2[79:64]
//                              // XMM1[95:80]   << XMM0
//                              //  -> XMM2[95:80]
//                              // XMM1[111:96]  << XMM0
//                              //  -> XMM2[111:96]
//                              // XMM1[127:112] << XMM0
//                              //  -> XMM2[127:112]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpsradcomma ( VPSRAD, )
//
// C prototype:
//  void dg_forthvpsradcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetcountparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetcountparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for targetcountparameterlist can contain these addressing
//   mode specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//   targetxmmregister
//   targetxmmregister XMMR
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  The parameter lists for targetxparameterlist or targetyparameterlist contain
//   these addressing mode specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//   targetymmregister
//   targetymmregister YMMR
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0 or 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding. If this is not
//                                 used then 2 byte vex encoding is used if
//                                 possible.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSRAD instruction. This opcode sequence arithmetically shifts each
//   32 bit value in the source xmm or ymm register right the number of bits
//   specified in the count value and puts the result into the destination.
//   The count value can come from an immediate value, an xmm register, or
//   memory. If the count is from memory, a 128 bit value is read from memory.
//   Copies of the high order bit of each 32 bit value are shifted in from
//   the left.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  XMM1  XMM2  VPSRAD,     // XMM1 -> XMM2 (no shift)
//
//  1 N  XMM1  XMM2  VPSRAD,     // XMM1[31:0]   >> 1 -> XMM2[31:0]
//                               // XMM1[63:32]  >> 1 -> XMM2[63:32]
//                               // XMM1[95:64]  >> 1 -> XMM2[95:64]
//                               // XMM1[127:96] >> 1 -> XMM2[127:96]
//
//  2 N  XMM1  XMM2  VPSRAD,     // XMM1[31:0]   >> 2 -> XMM2[31:0]
//                               // XMM1[63:32]  >> 2 -> XMM2[63:32]
//                               // XMM1[95:64]  >> 2 -> XMM2[95:64]
//                               // XMM1[127:96] >> 2 -> XMM2[127:96]
//
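//  32 N  XMM1  XMM2  VPSRAD,    // each 32 bit value in XMM2 is filled with
//                               //  copies of the sign bit of the
//                               //  corresponding 32 bit value in XMM1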
//
//  XMM0  XMM1  XMM2  VPSRAD,    // XMM1[31:0]   >> XMM0 -> XMM2[31:0]
//                               // XMM1[63:32]  >> XMM0 -> XMM2[63:32]
//                               // XMM1[95:64]  >> XMM0 -> XMM2[95:64]
//                               // XMM1[127:96] >> XMM0 -> XMM2[127:96]
//
//  RAX [R]  XMM1  XMM2  VPSRAD, // XMM1[31:0]   >> [RAX][63:0]
//                               //  -> XMM2[31:0]
//                               // XMM1[63:32]  >> [RAX][63:0]
//                               //  -> XMM2[63:32]
//                               // XMM1[95:64]  >> [RAX][63:0]
//                               //  -> XMM2[95:64]
//                               // XMM1[127:96] >> [RAX][63:0]
//                               //  -> XMM2[127:96]
//
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpsrawcomma ( VPSRAW, )
//
// C prototype:
//  void dg_forthvpsrawcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetcountparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetcountparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for targetcountparameterlist can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//   targetxmmregister
//   targetxmmregister XMMR
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  The parameter lists for targetxparameterlist or targetyparameterlist contain
//   these addressing mode specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//   targetymmregister
//   targetymmregister YMMR
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0 or 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding. If this is not
//                                 used then 2 byte vex encoding is used if
//                                 possible.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSRAW instruction. This opcode sequence arithmetically shifts each
//   16 bit value in the source xmm or ymm register right the number of bits
//   specified in the count value. Then puts the result into the destination.
//   Copies of the high order bit of each 16 bit value are shifted in from the
//   left. The count can come from an xmm register, an 8 bit immediate value,
//   or memory. If the count is from memory, a 128 bit value is read from
//   memory.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  XMM1  XMM2  VPSRAW,    // XMM1 -> XMM2 (no shift)
//
//  1 N  XMM1  XMM2  VPSRAW,    // XMM1[15:0]    >> 1 -> XMM2[15:0]
//                              // XMM1[31:16]   >> 1 -> XMM2[31:16]
//                              // XMM1[47:32]   >> 1 -> XMM2[47:32]
//                              // XMM1[63:48]   >> 1 -> XMM2[63:48]
//                              // XMM1[79:64]   >> 1 -> XMM2[79:64]
//                              // XMM1[95:80]   >> 1 -> XMM2[95:80]
//                              // XMM1[111:96]  >> 1 -> XMM2[111:96]
//                              // XMM1[127:112] >> 1 -> XMM2[127:112]
//
//  2 N  XMM1  XMM2  VPSRAW,    // XMM1[15:0]    >> 2 -> XMM2[15:0]
//                              // XMM1[31:16]   >> 2 -> XMM2[31:16]
//                              // XMM1[47:32]   >> 2 -> XMM2[47:32]
//                              // XMM1[63:48]   >> 2 -> XMM2[63:48]
//                              // XMM1[79:64]   >> 2 -> XMM2[79:64]
//                              // XMM1[95:80]   >> 2 -> XMM2[95:80]
//                              // XMM1[111:96]  >> 2 -> XMM2[111:96]
//                              // XMM1[127:112] >> 2 -> XMM2[127:112]
//
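//  16 N  XMM1  XMM2  VPSRAW,   // each 16 bit value in XMM2 is filled with
//                              //  copies of the sign bit of the
//                              //  corresponding 16 bit value in XMM1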
//
//  XMM0  XMM1  XMM2  VPSRAW,   // XMM1[15:0]    >> XMM0
//                              //  -> XMM2[15:0]
//                              // XMM1[31:16]   >> XMM0
//                              //  -> XMM2[31:16]
//                              // XMM1[47:32]   >> XMM0
//                              //  -> XMM2[47:32]
//                              // XMM1[63:48]   >> XMM0
//                              //  -> XMM2[63:48]
//                              // XMM1[79:64]   >> XMM0
//                              //  -> XMM2[79:64]
//                              // XMM1[95:80]   >> XMM0
//                              //  -> XMM2[95:80]
//                              // XMM1[111:96]  >> XMM0
//                              //  -> XMM2[111:96]
//                              // XMM1[127:112] >> XMM0
//                              //  -> XMM2[127:112]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpsrldcomma ( VPSRLD, )
//
// C prototype:
//  void dg_forthvpsrldcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetcountparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetcountparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for targetcountparameterlist can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//   targetxmmregister
//   targetxmmregister XMMR
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  The parameter lists for targetxparameterlist or targetyparameterlist contain
//   these addressing mode specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//   targetymmregister
//   targetymmregister YMMR
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0 or 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding. If this is not
//                                 used then 2 byte vex encoding is used if
//                                 possible.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSRLD instruction. This opcode sequence logically shifts each 32 bit
//   value in the source xmm or ymm register right the number of bits specified
//   in the count value and puts the result into the destination. If the count
//   value is greater than 31, the destination is cleared. The count value can
//   come from an immediate value, an xmm register, or memory.
//   If the count is from memory, a 128 bit value is read from memory.
//   Zeros are shifted in from the left.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  XMM1  XMM2  VPSRLD,     // XMM1 -> XMM2 (no shift)
//
//  1 N  XMM1  XMM2  VPSRLD,     // XMM1[31:0]   >> 1 -> XMM2[31:0]
//                               // XMM1[63:32]  >> 1 -> XMM2[63:32]
//                               // XMM1[95:64]  >> 1 -> XMM2[95:64]
//                               // XMM1[127:96] >> 1 -> XMM2[127:96]
//
//  2 N  XMM1  XMM2  VPSRLD,     // XMM1[31:0]   >> 2 -> XMM2[31:0]
//                               // XMM1[63:32]  >> 2 -> XMM2[63:32]
//                               // XMM1[95:64]  >> 2 -> XMM2[95:64]
//                               // XMM1[127:96] >> 2 -> XMM2[127:96]
//
//  32 N  XMM1  XMM2  VPSRLD,    // 0 -> XMM2[127:0]
//
//  XMM0  XMM1  XMM2  VPSRLD,    // XMM1[31:0]   >> XMM0 -> XMM2[31:0]
//                               // XMM1[63:32]  >> XMM0 -> XMM2[63:32]
//                               // XMM1[95:64]  >> XMM0 -> XMM2[95:64]
//                               // XMM1[127:96] >> XMM0 -> XMM2[127:96]
//
//  RAX [R]  XMM1  XMM2  VPSRLD, // XMM1[31:0]   >> [RAX][63:0]
//                               //  -> XMM2[31:0]
//                               // XMM1[63:32]  >> [RAX][63:0]
//                               //  -> XMM2[63:32]
//                               // XMM1[95:64]  >> [RAX][63:0]
//                               //  -> XMM2[95:64]
//                               // XMM1[127:96] >> [RAX][63:0]
//                               //  -> XMM2[127:96]
//
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpsrlqcomma ( VPSRLQ, )
//
// C prototype:
//  void dg_forthvpsrlqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetcountparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetcountparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for targetcountparameterlist can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//   targetxmmregister
//   targetxmmregister XMMR
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  The parameter lists for targetxparameterlist or targetyparameterlist contain
//   these addressing mode specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//   targetymmregister
//   targetymmregister YMMR
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0 or 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding. If this is not
//                                 used then 2 byte vex encoding is used if
//                                 possible.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSRLQ instruction. This opcode sequence logically shifts each 64 bit
//   value in the source xmm or ymm register right the number of bits specified
//   in the count value. Then puts the result into the destination xmm or ymm
//   register. If the count value is greater than 63, the destination is
//   cleared. The count can come from an xmm register, an 8 bit immediate
//   value, or memory. If the count is from memory, a 128 bit value is read from
//   memory.
//   Zeros are shifted in from the left.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  XMM1  XMM2  VPSRLQ,     // XMM1 -> XMM2 (no shift)
//
//  1 N  XMM1  XMM2  VPSRLQ,     // XMM1[63:0]    >l> 1 -> XMM2[63:0]
//                               // XMM1[127:64]  >l> 1 -> XMM2[127:64]
//
//  2 N  XMM1  XMM2  VPSRLQ,     // XMM1[63:0]    >l> 2 -> XMM2[63:0]
//                               // XMM1[127:64]  >l> 2 -> XMM2[127:64]
//
//  64 N  XMM1  XMM2  VPSRLQ,    // 0 -> XMM2[127:0]
//
//  XMM0  XMM1  XMM2  VPSRLQ,    // XMM1[63:0]   >l> XMM0 -> XMM2[63:0]
//                               // XMM1[127:64] >l> XMM0 -> XMM2[127:64]
//
//  RAX [R]  XMM1  XMM2  VPSRLQ, // XMM1[63:0]   >l> [RAX][63:0]
//                               //  -> XMM2[63:0]
//                               // XMM1[127:64] >l> [RAX][63:0]
//                               //  -> XMM2[127:64]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpsrlwcomma ( VPSRLW, )
//
// C prototype:
//  void dg_forthvpsrlwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetcountparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetcountparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for targetcountparameterlist can contain these addressing mode
//   specifiers:
//
//   datavalue N
//   datavalue 1 IMMEDIATE
//   targetxmmregister
//   targetxmmregister XMMR
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  The parameter lists for targetxparameterlist or targetyparameterlist contain
//   these addressing mode specifiers:
//
//   targetxmmregister
//   targetxmmregister XMMR
//   targetymmregister
//   targetymmregister YMMR
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               an 8 bit value
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible and can't be larger than a signed
//                                 32 bit integer
//
//   minimumimmediatesize         minimum size used to encode the immediate
//                                 value in bytes, can be either 0 or 1
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding. If this is not
//                                 used then 2 byte vex encoding is used if
//                                 possible.
//                                 This is pushed after addressing mode
//                                 parameters in a target's parameter list
//                                 and can not come in the middle
//                                 of addressing mode parameters.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSRLW instruction. This opcode sequence logical shifts each 16 bit
//   value in the source xmm or ymm register right the number of bits specified
//   in the count value. Then puts the result into the destination.
//   Zeros are shifted in from the left. If the count value is greater than 15,
//   the destination is cleared. The count can come from an xmm register, an
//   8 bit immediate value, or memory. If the count comes from memory, a 128 bit
//   value is read from memory.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit mode examples:
//  HEX
//  0 N  XMM1  XMM2  VPSRLW,    // XMM1[127:0] -> XMM2[127:0] (no shift)
//
//  1 N  XMM1  XMM2  VPSRLW,    // XMM1[15:0]    >> 1 -> XMM2[15:0]
//                              // XMM1[31:16]   >> 1 -> XMM2[31:16]
//                              // XMM1[47:32]   >> 1 -> XMM2[47:32]
//                              // XMM1[63:48]   >> 1 -> XMM2[63:48]
//                              // XMM1[79:64]   >> 1 -> XMM2[79:64]
//                              // XMM1[95:80]   >> 1 -> XMM2[95:80]
//                              // XMM1[111:96]  >> 1 -> XMM2[111:96]
//                              // XMM1[127:112] >> 1 -> XMM2[127:112]
//
//  2 N  XMM1  XMM2  VPSRLW,    // XMM1[15:0]    >> 2 -> XMM2[15:0]
//                              // XMM1[31:16]   >> 2 -> XMM2[31:16]
//                              // XMM1[47:32]   >> 2 -> XMM2[47:32]
//                              // XMM1[63:48]   >> 2 -> XMM2[63:48]
//                              // XMM1[79:64]   >> 2 -> XMM2[79:64]
//                              // XMM1[95:80]   >> 2 -> XMM2[95:80]
//                              // XMM1[111:96]  >> 2 -> XMM2[111:96]
//                              // XMM1[127:112] >> 2 -> XMM2[127:112]
//
//  16 N  XMM1  XMM2  VPSRLW,   // 0 -> XMM2[127:0]
//
//  XMM0  XMM1  XMM2  VPSRLW,   // XMM1[15:0]    >> XMM0
//                              //  -> XMM2[15:0]
//                              // XMM1[31:16]   >> XMM0
//                              //  -> XMM2[31:16]
//                              // XMM1[47:32]   >> XMM0
//                              //  -> XMM2[47:32]
//                              // XMM1[63:48]   >> XMM0
//                              //  -> XMM2[63:48]
//                              // XMM1[79:64]   >> XMM0
//                              //  -> XMM2[79:64]
//                              // XMM1[95:80]   >> XMM0
//                              //  -> XMM2[95:80]
//                              // XMM1[111:96]  >> XMM0
//                              //  -> XMM2[111:96]
//                              // XMM1[127:112] >> XMM0
//                              //  -> XMM2[127:112]
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrdrandcomma ( RDRAND, )
//
// C prototype:
//  void dg_forthrdrandcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( targetparameterlist -- )
//
// Data stack in:
//  targetparameterlist
//
//  The parameter list for the target can contain these addressing mode specifiers:
//   targetregister R
//   targetregister
//
//  Description of these parameters:
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   R                            specifies a register target.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 RDRAND instruction. The opcode sequence puts a random number
//   into the target register. The carry flag determines if a random number
//   is available. If the carry flag is 0 after this instruction, the register
//   gets 0 and you are supposed to try again later.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  BEGIN,
//    AX RDRAND,
//  CS UNTIL,
//
//  BEGIN,
//    EAX RDRAND,
//  CS UNTIL,
//
//  BEGIN,
//    RAX RDRAND,
//  CS UNTIL,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrdseedcomma ( RDSEED, )
//
// C prototype:
//  void dg_forthrdseedcomma(Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( targetparameterlist -- )
//
// Data stack in:
//  targetparameterlist
//
//  The parameter list for the target can contain these addressing mode specifiers:
//   targetregister R
//   targetregister
//
//  Description of these parameters:
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   R                            specifies a register target.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls one target from the data stack and compiles the opcode sequence for
//   an x86 RDSEED instruction. The opcode sequence puts a random number
//   into the target register. The random number is generated using an
//   enhanced non deterministic bit generator that is compliant with the
//   NIST SP800-90B & C standard.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  AX RDSEED,
//
//  EAX RDSEED,
//
//  RAX RDSEED,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvextractf128comma ( VEXTRACTF128, )
//
// C prototype:
//  void dg_forthvextractf128comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter lists for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter lists for targets x and y can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VEXTRACTF128 instruction. This opcode sequence copies a 128 bit value
//   from either the lower or higher 128 bits of the ymm register source depending 
//   on whether bit 0 of the immediate target is a 0 or 1 and puts the value into 
//   the memory or xmm register destination.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  YMM0  RBX [R]  VEXTRACTF128,    // YMM0[127:0]     -> [RBX][127:0]
//
//  01 N  YMM0  RBX [R]  VEXTRACTF128,    // YMM0[255:128]   -> [RBX][127:0]
//
//  01 N  XMM2 <-  YMM0  VEXTRACTF128, // YMM0[255:128] -> XMM2[127:0]
//
//  01 N  YMM0  XMM8 VEXTRACTF128,        // YMM0[255:128] -> XMM8[127:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or memory. The first target must be the immediate target. If you use
//   -> it must come after a memory, xmm or ymm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0. The source must be a ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvextracti128comma ( VEXTRACTI128, )
//
// C prototype:
//  void dg_forthvextracti128comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter lists for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter lists for targets x and y can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VEXTRACTI128 instruction. This opcode sequence copies a 128 bit value
//   from either the lower or higher 128 bits of the ymm register source depending 
//   on whether bit 0 of the immediate target is a 0 or 1 and puts the value into 
//   the memory or xmm register destination.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  YMM0  RBX [R]  VEXTRACTI128,    // YMM0[127:0]     -> [RBX][127:0]
//
//  01 N  YMM0  RBX [R]  VEXTRACTI128,    // YMM0[255:128]   -> [RBX][127:0]
//
//  01 N  XMM2 <-  YMM0  VEXTRACTI128, // YMM0[255:128] -> XMM2[127:0]
//
//  01 N  YMM0  XMM8 VEXTRACTI128,        // YMM0[255:128] -> XMM8[127:0]
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   register or memory. The first target must be the immediate target. If you use
//   -> it must come after a memory, xmm or ymm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0. The source must be a ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvgatherdpdcomma ( VGATHERDPD, )
//
// C prototype:
//  void dg_forthvgatherdpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetxparameterlist or targetyparameterlist can contain 
//   these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetyparameterlist or targetzparameterlist can
//   contain this addressing mode specifier:
//
//   baseregister scale indexregister displacement [R+S*YMMR+N]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister scale indexregister displacement minimumdisplacementsize [VSIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   [R+S*YMMR+N]                 specifies memory targets at the addresses at
//                                 baseregister +
//                                 (scale*indexregister[31:0]) +
//                                 displacement (displacement is signed)
//                                 baseregister +
//                                 (scale*indexregister[63:32]) +
//                                 displacement (displacement is signed)
//                                 baseregister +
//                                 (scale*indexregister[95:64]) +
//                                 displacement (displacement is signed)
//                                 baseregister +
//                                 (scale*indexregister[127:96]) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [VSIB]                       specifies memory targets at the addresses at
//                                 baseregister +
//                                 (scale*indexregister[31:0]) +
//                                 displacement (displacement is signed)
//                                 baseregister +
//                                 (scale*indexregister[63:32]) +
//                                 displacement (displacement is signed)
//                                 baseregister +
//                                 (scale*indexregister[95:64]) +
//                                 displacement (displacement is signed)
//                                 baseregister +
//                                 (scale*indexregister[127:96]) +
//                                 displacement (displacement is signed)
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VGATHERDPD instruction. This opcode sequence copies each of 2 or 4
//   double precision floating point values from the sources if the high bit of
//   the corresponding 64 bit value in the mask is set and puts the values into the
//   destination.  If the high bit of the corresponding 64 bit value in the mask
//   is clear, the corresponding 64 bit value in the destination is left
//   unchanged. This opcode sequence partitions the lower 128 bits of the ymmr  
//   index register into 4 separate 32 bit signed indexes to generate 4
//   different source addresses. 
//   This opcode sequence also clears the mask register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  XMM0  RAX SCALE1* YMM2 0 [R+S*YMMR+N]  XMM1  VGATHERDPD,
//   // if XMM0[63] is 1 then
//   //  [RAX + YMM2[31:0]][63:0] -> XMM1[63:0]
//   // else
//   //  XMM1[63:0] is left unchanged
//
//   // if XMM0[127] is 1 then
//   //  [RAX + YMM2[63:32]][127:64] -> XMM1[127:64]
//   // else
//   //  XMM1[127:64] is left unchanged
//
//   // 0 -> XMM0
//
//  XMM0  RAX SCALE2* YMM2 -8 [R+S*YMMR+N]  XMM1  VGATHERDPD,
//   // if XMM0[63] is 1 then
//   //  [RAX + (2*YMM2[31:0]) - 8][63:0] -> XMM1[63:0]
//   // else
//   //  XMM1[63:0] is left unchanged
//
//   // if XMM0[127] is 1 then
//   //  [RAX + (2*YMM2[63:32]) - 8][127:64] -> XMM1[127:64]
//   // else
//   //  XMM1[127:64] is left unchanged
//
//   // 0 -> XMM0
//
//  YMM1 <-  RAX SCALE1* YMM2 0 [R+S*YMMR+N]  YMM0  VGATHERDPD,
//   // if YMM0[63] is 1 then
//   //  [RAX + YMM2[31:0]][63:0] -> YMM1[63:0]
//   // else
//   //  YMM1[63:0] is left unchanged
//
//   // if YMM0[127] is 1 then
//   //  [RAX + YMM2[63:32]][127:64] -> YMM1[127:64]
//   // else
//   //  YMM1[127:64] is left unchanged
//
//   // if YMM0[191] is 1 then
//   //  [RAX + YMM2[95:64]][191:128] -> YMM1[191:128]
//   // else
//   //  YMM1[191:128] is left unchanged
//
//   // if YMM0[255] is 1 then
//   //  [RAX + YMM2[127:96]][255:192] -> YMM1[255:192]
//   // else
//   //  YMM1[255:192] is left unchanged
//
//   // 0 -> YMM0
//
// Note:
//  The source target must be a vsib memory target.
//  The destination target can be either an xmm or ymm register and determines
//   whether 2 or 4 64 bit values are copied if their corresponding mask bits 
//   are set.
//  The mask register must be an xmm or ymm register and should be the same
//   kind of register as the destination, but if it isn't, the mask register's
//   size is ignored.
//  The index register is supposed to be a ymm register but its size is also
//   ignored, so an XMM register will work.
//  It probably does not matter if the values copied are 64 bit floating point
//   values, any 64 bit value should work.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvgatherqpdcomma ( VGATHERQPD, )
//
// C prototype:
//  void dg_forthvgatherqpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetxparameterlist or targetyparameterlist can contain 
//   these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetyparameterlist or targetzparameterlist can
//   contain this addressing mode specifier:
//
//   baseregister scale indexregister displacement [R+S*YMMR+N]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister scale indexregister displacement minimumdisplacementsize [VSIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   [R+S*YMMR+N]                 specifies memory targets at the addresses at
//                                 baseregister + 
//                                 (scale*(indexregister[63:0]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[127:64]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[191:128]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[255:192]) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [VSIB]                       specifies a memory target at the address at
//                                 baseregister + 
//                                 (scale*(indexregister[63:0]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[127:64]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[191:128]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[255:192]) +
//                                 displacement (displacement is signed)
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VGATHERQPD instruction. This opcode sequence copies each of 2 or 4
//   double precision floating point values from the sources if the high bit of
//   corresponding 64 bit value in the mask is set and puts the values into the
//   destination.  If the high bit of the corresponding 64 bit value in the mask
//   is clear, the corresponding 64 bit value in the destination is left
//   unchanged. This opcode sequence partitions the ymmr  
//   index register into 4 separate 64 bit signed indexes to generate 4
//   different source addresses. 
//   This opcode sequence also clears the mask register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  XMM0  RAX SCALE1* YMM2 0 [R+S*YMMR+N]  XMM1  VGATHERQPD,
//   // if XMM0[63] is 1 then
//   //  [RAX + YMM2[63:0]][63:0] -> XMM1[63:0]
//   // else
//   //  XMM1[63:0] is left unchanged
//
//   // if XMM0[127] is 1 then
//   //  [RAX + YMM2[127:64]][127:64] -> XMM1[127:64]
//   // else
//   //  XMM1[127:64] is left unchanged
//
//   // 0 -> XMM0
//
//  XMM0  RAX SCALE2* YMM2 -8 [R+S*YMMR+N]  XMM1  VGATHERQPD,
//   // if XMM0[63] is 1 then
//   //  [RAX + (2*YMM2[63:0]) - 8][63:0] -> XMM1[63:0]
//   // else
//   //  XMM1[63:0] is left unchanged
//
//   // if XMM0[127] is 1 then
//   //  [RAX + (2*YMM2[127:64]) - 8][127:64] -> XMM1[127:64]
//   // else
//   //  XMM1[127:64] is left unchanged
//
//   // 0 -> XMM0
//
//  YMM1 <-  RAX SCALE1* YMM2 0 [R+S*YMMR+N]  YMM0  VGATHERQPD,
//   // if YMM0[63] is 1 then
//   //  [RAX + YMM2[63:0]][63:0] -> YMM1[63:0]
//   // else
//   //  YMM1[63:0] is left unchanged
//
//   // if YMM0[127] is 1 then
//   //  [RAX + YMM2[127:64]][127:64] -> YMM1[127:64]
//   // else
//   //  YMM1[127:64] is left unchanged
//
//   // if YMM0[191] is 1 then
//   //  [RAX + YMM2[191:128]][191:128] -> YMM1[191:128]
//   // else
//   //  YMM1[191:128] is left unchanged
//
//   // if YMM0[255] is 1 then
//   //  [RAX + YMM2[255:192]][255:192] -> YMM1[255:192]
//   // else
//   //  YMM1[255:192] is left unchanged
//
//   // 0 -> YMM0
//
//
// Note:
//  The source target must be a vsib memory target.
//  The destination target can be either an xmm or ymm register and determines
//   whether 2 or 4 64 bit values are copied if their corresponding mask bits 
//   are set.
//  The mask register must be an xmm or ymm register and should be the same
//   kind of register as the destination, but if it isn't, the mask register's
//   size is ignored.
//  The index register is supposed to be a ymm register but its size is also
//   ignored, so an XMM register will work.
//  It probably does not matter if the values copied are 64 bit floating point
//   values, any 64 bit value should work.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvgatherdpscomma ( VGATHERDPS, )
//
// C prototype:
//  void dg_forthvgatherdpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetxparameterlist or targetyparameterlist can contain 
//   these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetyparameterlist or targetzparameterlist can
//   contain this addressing mode specifier:
//
//   baseregister scale indexregister displacement [R+S*YMMR+N]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister scale indexregister displacement minimumdisplacementsize [VSIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   [R+S*YMMR+N]                 specifies memory targets at the addresses at
//                                 baseregister + 
//                                  (scale*(indexregister[31:0]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[63:32]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[95:64]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[127:96]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[159:128]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[191:160]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[223:192]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[255:224]) +
//                                  displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [VSIB]                       specifies memory targets at the addresses at
//                                 baseregister + 
//                                  (scale*(indexregister[31:0]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[63:32]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[95:64]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[127:96]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[159:128]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[191:160]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[223:192]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[255:224]) +
//                                  displacement (displacement is signed)
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VGATHERDPS instruction. This opcode sequence copies each of 4 or 8
//   single precision floating point values from the sources if the high bit of
//   corresponding 32 bit value in the mask is set and puts the values into the
//   destination.  If the high bit of the corresponding 32 bit value in the mask
//   is clear, the corresponding 32 bit value in the destination is left
//   unchanged. This opcode sequence partitions the ymmr  
//   index register into 8 separate 32 bit signed indexes to generate 8
//   different source addresses. 
//   This opcode sequence also clears the mask register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  XMM0  RAX SCALE1* YMM2 0 [R+S*YMMR+N]  XMM1  VGATHERDPS,
//   // if XMM0[31] is 1 then
//   //  [RAX + YMM2[31:0]][31:0] -> XMM1[31:0]
//   // else
//   //  XMM1[31:0] is left unchanged
//
//   // if XMM0[63] is 1 then
//   //  [RAX + YMM2[63:32]][63:32] -> XMM1[63:32]
//   // else
//   //  XMM1[63:32] is left unchanged
//
//   // if XMM0[95] is 1 then
//   //  [RAX + YMM2[95:64]][95:64] -> XMM1[95:64]
//   // else
//   //  XMM1[95:64] is left unchanged
//
//   // if XMM0[127] is 1 then
//   //  [RAX + YMM2[127:96]][127:96] -> XMM1[127:96]
//   // else
//   //  XMM1[127:96] is left unchanged
//
//   // 0 -> XMM0
//
//  XMM0  RAX SCALE2* YMM2 -8 [R+S*YMMR+N]  XMM1  VGATHERDPS,
//   // if XMM0[31] is 1 then
//   //  [RAX + (2*YMM2[31:0]) - 8][31:0] -> XMM1[31:0]
//   // else
//   //  XMM1[31:0] is left unchanged
//
//   // if XMM0[63] is 1 then
//   //  [RAX + (2*YMM2[63:32]) - 8][63:32] -> XMM1[63:32]
//   // else
//   //  XMM1[63:32] is left unchanged
//
//   // if XMM0[95] is 1 then
//   //  [RAX + (2*YMM2[95:64]) - 8][95:64] -> XMM1[95:64]
//   // else
//   //  XMM1[95:64] is left unchanged
//
//   // if XMM0[127] is 1 then
//   //  [RAX + (2*YMM2[127:96]) - 8][127:96] -> XMM1[127:96]
//   // else
//   //  XMM1[127:96] is left unchanged
//
//   // 0 -> XMM0
//
//  YMM1 <-  RAX SCALE1* YMM2 0 [R+S*YMMR+N]  YMM0  VGATHERDPS,
//   // if YMM0[31] is 1 then
//   //  [RAX + YMM2[31:0]][31:0] -> YMM1[31:0]
//   // else
//   //  YMM1[31:0] is left unchanged
//
//   // if YMM0[63] is 1 then
//   //  [RAX + YMM2[63:32]][63:32] -> YMM1[63:32]
//   // else
//   //  YMM1[63:32] is left unchanged
//
//   // if YMM0[95] is 1 then
//   //  [RAX + YMM2[95:64]][95:64] -> YMM1[95:64]
//   // else
//   //  YMM1[95:64] is left unchanged
//
//   // if YMM0[127] is 1 then
//   //  [RAX + YMM2[127:96]][127:96] -> YMM1[127:96]
//   // else
//   //  YMM1[127:96] is left unchanged
//
//   // if YMM0[159] is 1 then
//   //  [RAX + YMM2[159:128]][159:128] -> YMM1[159:128]
//   // else
//   //  YMM1[159:128] is left unchanged
//
//   // if YMM0[191] is 1 then
//   //  [RAX + YMM2[191:160]][191:160] -> YMM1[191:160]
//   // else
//   //  YMM1[191:160] is left unchanged
//
//   // if YMM0[223] is 1 then
//   //  [RAX + YMM2[223:192]][223:192] -> YMM1[223:192]
//   // else
//   //  YMM1[223:192] is left unchanged
//
//   // if YMM0[255] is 1 then
//   //  [RAX + YMM2[255:224]][255:224] -> YMM1[255:224]
//   // else
//   //  YMM1[255:224] is left unchanged
//
//   // 0 -> YMM0
//
// Note:
//  The source target must be a vsib memory target.
//  The destination target can be either an xmm or ymm register and determines
//   whether 4 or 8 32 bit values are copied if their corresponding mask bits 
//   are set.
//  The mask register must be an xmm or ymm register and should be the same
//   kind of register as the destination, but if it isn't, the mask register's
//   size is ignored.
//  The index register is supposed to be a ymm register but its size is also
//   ignored, so an XMM register will work.
//  It probably does not matter if the values copied are 32 bit floating point
//   values, any 32 bit value should work.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpgatherdscomma ( VPGATHERDS, )
//
// C prototype:
//  void dg_forthvpgatherdscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetxparameterlist or targetyparameterlist can contain 
//   these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetyparameterlist or targetzparameterlist can
//   contain this addressing mode specifier:
//
//   baseregister scale indexregister displacement [R+S*YMMR+N]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister scale indexregister displacement minimumdisplacementsize [VSIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   [R+S*YMMR+N]                 specifies memory targets at the addresses at
//                                 baseregister + 
//                                  (scale*(indexregister[31:0]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[63:32]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[95:64]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[127:96]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[159:128]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[191:160]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[223:192]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[255:224]) +
//                                  displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [VSIB]                       specifies memory targets at the addresses at
//                                 baseregister + 
//                                  (scale*(indexregister[31:0]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[63:32]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[95:64]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[127:96]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[159:128]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[191:160]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[223:192]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[255:224]) +
//                                  displacement (displacement is signed)
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPGATHERDS instruction. This opcode sequence copies each of 4 or 8
//   32 bit values from the sources if the high bit of
//   corresponding 32 bit value in the mask is set and puts the values into the
//   destination.  If the high bit of the corresponding 32 bit value in the mask
//   is clear, the corresponding 32 bit value in the destination is left
//   unchanged. This opcode sequence partitions the ymmr  
//   index register into 8 separate 32 bit signed indexes to generate 8
//   different source addresses. 
//   This opcode sequence also clears the mask register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  XMM0  RAX SCALE1* YMM2 0 [R+S*YMMR+N]  XMM1  VPGATHERDS,
//   // if XMM0[31] is 1 then
//   //  [RAX + YMM2[31:0]][31:0] -> XMM1[31:0]
//   // else
//   //  XMM1[31:0] is left unchanged
//
//   // if XMM0[63] is 1 then
//   //  [RAX + YMM2[63:32]][63:32] -> XMM1[63:32]
//   // else
//   //  XMM1[63:32] is left unchanged
//
//   // if XMM0[95] is 1 then
//   //  [RAX + YMM2[95:64]][95:64] -> XMM1[95:64]
//   // else
//   //  XMM1[95:64] is left unchanged
//
//   // if XMM0[127] is 1 then
//   //  [RAX + YMM2[127:96]][127:96] -> XMM1[127:96]
//   // else
//   //  XMM1[127:96] is left unchanged
//
//   // 0 -> XMM0
//
//  XMM0  RAX SCALE2* YMM2 -8 [R+S*YMMR+N]  XMM1  VPGATHERDS,
//   // if XMM0[31] is 1 then
//   //  [RAX + (2*YMM2[31:0]) - 8][31:0] -> XMM1[31:0]
//   // else
//   //  XMM1[31:0] is left unchanged
//
//   // if XMM0[63] is 1 then
//   //  [RAX + (2*YMM2[63:32]) - 8][63:32] -> XMM1[63:32]
//   // else
//   //  XMM1[63:32] is left unchanged
//
//   // if XMM0[95] is 1 then
//   //  [RAX + (2*YMM2[95:64]) - 8][95:64] -> XMM1[95:64]
//   // else
//   //  XMM1[95:64] is left unchanged
//
//   // if XMM0[127] is 1 then
//   //  [RAX + (2*YMM2[127:96]) - 8][127:96] -> XMM1[127:96]
//   // else
//   //  XMM1[127:96] is left unchanged
//
//   // 0 -> XMM0
//
//  YMM1 <-  RAX SCALE1* YMM2 0 [R+S*YMMR+N]  YMM0  VPGATHERDS,
//   // if YMM0[31] is 1 then
//   //  [RAX + YMM2[31:0]][31:0] -> YMM1[31:0]
//   // else
//   //  YMM1[31:0] is left unchanged
//
//   // if YMM0[63] is 1 then
//   //  [RAX + YMM2[63:32]][63:32] -> YMM1[63:32]
//   // else
//   //  YMM1[63:32] is left unchanged
//
//   // if YMM0[95] is 1 then
//   //  [RAX + YMM2[95:64]][95:64] -> YMM1[95:64]
//   // else
//   //  YMM1[95:64] is left unchanged
//
//   // if YMM0[127] is 1 then
//   //  [RAX + YMM2[127:96]][127:96] -> YMM1[127:96]
//   // else
//   //  YMM1[127:96] is left unchanged
//
//   // if YMM0[159] is 1 then
//   //  [RAX + YMM2[159:128]][159:128] -> YMM1[159:128]
//   // else
//   //  YMM1[159:128] is left unchanged
//
//   // if YMM0[191] is 1 then
//   //  [RAX + YMM2[191:160]][191:160] -> YMM1[191:160]
//   // else
//   //  YMM1[191:160] is left unchanged
//
//   // if YMM0[223] is 1 then
//   //  [RAX + YMM2[223:192]][223:192] -> YMM1[223:192]
//   // else
//   //  YMM1[223:192] is left unchanged
//
//   // if YMM0[255] is 1 then
//   //  [RAX + YMM2[255:224]][255:224] -> YMM1[255:224]
//   // else
//   //  YMM1[255:224] is left unchanged
//
//   // 0 -> YMM0
//
// Note:
//  The source target must be a vsib memory target.
//  The destination target can be either an xmm or ymm register and determines
//   whether 4 or 8 32 bit values are copied if their corresponding mask bits 
//   are set.
//  The mask register must be an xmm or ymm register and should be the same
//   kind of register as the source, but if it isn't, the mask register's size
//   is ignored.
//  The index register is supposed to be a ymm register but its size is also
//   ignored, so an XMM register will work.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvgatherqpscomma ( VGATHERQPS, )
//
// C prototype:
//  void dg_forthvgatherqpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetxparameterlist or targetyparameterlist can contain 
//   these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetyparameterlist or targetzparameterlist can
//   contain this addressing mode specifier:
//
//   baseregister scale indexregister displacement [R+S*YMMR+N]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister scale indexregister displacement minimumdisplacementsize [VSIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   [R+S*YMMR+N]                 specifies memory targets at the addresses at
//                                 baseregister + 
//                                  (scale*(indexregister[63:0]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[127:64]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[191:128]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[255:192]) +
//                                  displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [VSIB]                       specifies memory targets at the addresses at
//                                 baseregister + 
//                                  (scale*(indexregister[63:0]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[127:64]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[191:128]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[255:192]) +
//                                  displacement (displacement is signed)
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VGATHERQPS instruction. This opcode sequence copies each of 2 or 4
//   single precision floating point values from the sources if the high bit of
//   corresponding 32 bit value in the mask is set and puts the values into the
//   destination.  If the high bit of the corresponding 32 bit value in the mask
//   is clear, the corresponding 32 bit value in the destination is left
//   unchanged. This opcode sequence partitions the ymmr  
//   index register into 4 separate 64 bit signed indexes to generate 4
//   different source addresses. 
//   This opcode sequence also clears the mask register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  XMM0  RAX SCALE1* YMM2 0 [R+S*YMMR+N]  XMM1  VGATHERQPS,
//   // if XMM0[31] is 1 then   
//   //  [RAX + YMM2[63:0]][31:0] -> XMM1[31:0]
//   // else
//   //  0 -> XMM1[31:0]
//
//   // if XMM0[63] is 1 then   
//   //  [RAX + YMM2[127:64]][63:32] -> XMM1[63:32]
//   // else
//   //  0 -> XMM1[63:32]
//
//   // 0 -> XMM0
//
//  XMM0  RAX SCALE2* YMM2 -8 [R+S*YMMR+N]  XMM1  VGATHERQPS,
//   // if XMM0[31] is 1 then   
//   //  [RAX + (2*YMM2[63:0]) - 8][31:0] -> XMM1[31:0]
//   // else
//   //  0 -> XMM1[31:0]
//
//   // if XMM0[63] is 1 then   
//   //  [RAX + (2*YMM2[127:64]) - 8][63:32] -> XMM1[63:32]
//   // else
//   //  0 -> XMM1[63:32]
//
//   // 0 -> XMM0
//
//  YMM1 <-  RAX SCALE1* YMM2 0 [R+S*YMMR+N]  YMM0  VGATHERQPS,
//   // if YMM0[31] is 1 then   
//   //  [RAX + YMM2[63:0]][31:0] -> YMM1[31:0]
//   // else
//   //  0 -> YMM1[31:0]
//
//   // if YMM0[63] is 1 then   
//   //  [RAX + YMM2[127:64]][63:32] -> YMM1[63:32]
//   // else
//   //  0 -> YMM1[63:32]
//
//   // if YMM0[95] is 1 then   
//   //  [RAX + YMM2[191:128]][95:64] -> YMM1[95:64]
//   // else
//   //  0 -> YMM1[95:64]
//
//   // if YMM0[127] is 1 then   
//   //  [RAX + YMM2[255:192]][127:96] -> YMM1[127:96]
//   // else
//   //  0 -> YMM1[127:96]
//
//   // 0 -> YMM0
//
// Note:
//  The source target must be a vsib memory target.
//  The destination target can be either an xmm or ymm register and determines
//   whether 2 or 4 32 bit values are copied if their corresponding mask bits 
//   are set.
//  The mask and index registers must be an xmm or ymm register and should be 
//   the same size as the source, but if they are not the same size
//   their size is ignored.
//  It probably does not matter if the values copied are 32 bit floating point
//   values, any 32 bit value should work.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpgatherqscomma ( VPGATHERQS, )
//
// C prototype:
//  void dg_forthvpgatherqscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetxparameterlist or targetyparameterlist can contain 
//   these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetyparameterlist or targetzparameterlist can
//   contain this addressing mode specifier:
//
//   baseregister scale indexregister displacement [R+S*YMMR+N]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister scale indexregister displacement minimumdisplacementsize [VSIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   [R+S*YMMR+N]                 specifies memory targets at the addresses at
//                                 baseregister + 
//                                  (scale*(indexregister[63:0]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[127:64]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[191:128]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[255:192]) +
//                                  displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [VSIB]                       specifies memory targets at the addresses at
//                                 baseregister + 
//                                  (scale*(indexregister[63:0]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[127:64]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[191:128]) +
//                                  displacement (displacement is signed)
//                                 baseregister + 
//                                  (scale*(indexregister[255:192]) +
//                                  displacement (displacement is signed)
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPGATHERQS instruction. This opcode sequence copies each of 2 or 4
//   32 bit values from the sources if the high bit of
//   corresponding 32 bit value in the mask is set and puts the values into the
//   destination.  If the high bit of the corresponding 32 bit value in the mask
//   is clear, the corresponding 32 bit value in the destination is left
//   unchanged. This opcode sequence partitions the ymmr  
//   index register into 4 separate 64 bit signed indexes to generate 4
//   different source addresses. 
//   This opcode sequence also clears the mask register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  XMM0  RAX SCALE1* YMM2 0 [R+S*YMMR+N]  XMM1  VPGATHERQS,
//   // if XMM0[31] is 1 then   
//   //  [RAX + YMM2[63:0]][31:0] -> XMM1[31:0]
//   // else
//   //  0 -> XMM1[31:0]
//
//   // if XMM0[63] is 1 then   
//   //  [RAX + YMM2[127:64]][63:32] -> XMM1[63:32]
//   // else
//   //  0 -> XMM1[63:32]
//
//   // 0 -> XMM0
//
//  XMM0  RAX SCALE2* YMM2 -8 [R+S*YMMR+N]  XMM1  VPGATHERQS,
//   // if XMM0[31] is 1 then   
//   //  [RAX + (2*YMM2[63:0]) - 8][31:0] -> XMM1[31:0]
//   // else
//   //  0 -> XMM1[31:0]
//
//   // if XMM0[63] is 1 then   
//   //  [RAX + (2*YMM2[127:64]) - 8][63:32] -> XMM1[63:32]
//   // else
//   //  0 -> XMM1[63:32]
//
//   // 0 -> XMM0
//
//  YMM1 <-  RAX SCALE1* YMM2 0 [R+S*YMMR+N]  YMM0  VPGATHERQS,
//   // if YMM0[31] is 1 then   
//   //  [RAX + YMM2[63:0]][31:0] -> YMM1[31:0]
//   // else
//   //  0 -> YMM1[31:0]
//
//   // if YMM0[63] is 1 then   
//   //  [RAX + YMM2[127:64]][63:32] -> YMM1[63:32]
//   // else
//   //  0 -> YMM1[63:32]
//
//   // if YMM0[95] is 1 then   
//   //  [RAX + YMM2[191:128]][95:64] -> YMM1[95:64]
//   // else
//   //  0 -> YMM1[95:64]
//
//   // if YMM0[127] is 1 then   
//   //  [RAX + YMM2[255:192]][127:96] -> YMM1[127:96]
//   // else
//   //  0 -> YMM1[127:96]
//
//   // 0 -> YMM0
//
// Note:
//  The source target must be a vsib memory target.
//  The destination target can be either an xmm or ymm register and determines
//   whether 2 or 4 32 bit values are copied if their corresponding mask bits 
//   are set.
//  The mask and index registers must be an xmm or ymm register and should be 
//   the same size as the source, but if they are not the same size
//   their size is ignored.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpgatherddcomma ( VPGATHERDD, )
//
// C prototype:
//  void dg_forthvpgatherddcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetxparameterlist or targetyparameterlist can contain 
//   these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetyparameterlist or targetzparameterlist can
//   contain this addressing mode specifier:
//
//   baseregister scale indexregister displacement [R+S*YMMR+N]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister scale indexregister displacement minimumdisplacementsize [VSIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   [R+S*YMMR+N]                 specifies memory targets at the addresses at
//                                 baseregister + 
//                                 (scale*(indexregister[31:0]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[63:32]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[95:64]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[127:96]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[159:128]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[191:160]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[223:192]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[255:224]) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [VSIB]                       specifies a memory target at the address at
//                                 baseregister + 
//                                 (scale*(indexregister[31:0]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[63:32]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[95:64]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[127:96]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[159:128]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[191:160]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[223:192]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[255:224]) +
//                                 displacement (displacement is signed)
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPGATHERDD instruction. This opcode sequence copies each of 4 or 8
//   32 bit values from the sources if the high bit of corresponding 32 bit
//   value in the mask is set and puts the values into the destination. If the 
//   high bit of the corresponding 32 bit value in the mask is clear, the 
//   corresponding 32 bit value in the destination is left unchanged. 
//   This opcode sequence also clears the mask register. This opcode sequence 
//   partitions the ymmr index register into 8 separate 32 bit signed indexes 
//   to generate 8 different source addresses.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  XMM0  RAX SCALE1* YMM2 0 [R+S*YMMR+N]  XMM1  VPGATHERDD,
//   // if XMM0[31] is 1 then   
//   //  [RAX + YMM2[31:0]][31:0] -> XMM1[31:0]
//   // else
//   //  0 -> XMM1[31:0]
//
//   // if XMM0[63] is 1 then   
//   //  [RAX + YMM2[63:32]][63:32] -> XMM1[63:32]
//   // else
//   //  0 -> XMM1[63:32]
//
//   // if XMM0[95] is 1 then   
//   //  [RAX + YMM2[95:64]][95:64] -> XMM1[95:64]
//   // else
//   //  0 -> XMM1[95:64]
// 
//   // if XMM0[127] is 1 then   
//   //  [RAX + YMM2[127:96]][127:96] -> XMM1[127:96]
//   // else
//   //  0 -> XMM1[127:96]
//
//   // 0 -> XMM0
//
//  XMM0  RAX SCALE2* YMM2 -8 [R+S*YMMR+N]  XMM1  VPGATHERDD,
//   // if XMM0[31] is 1 then   
//   //  [RAX + (2*YMM2[31:0]) - 8][31:0] -> XMM1[31:0]
//   // else
//   //  0 -> XMM1[31:0]
//
//    // if XMM0[63] is 1 then   
//    //  [RAX + (2*YMM2[63:32]) - 8][63:32] -> XMM1[63:32]
//    // else
//    //  0 -> XMM1[63:32]
// 
//    // if XMM0[95] is 1 then   
//    //  [RAX + (2*YMM2[95:64]) - 8][95:64] -> XMM1[95:64]
//    // else
//    //  0 -> XMM1[95:64]
//
//    // if XMM0[127] is 1 then   
//    //  [RAX + (2*YMM2[127:96]) - 8][127:96] -> XMM1[127:96]
//    // else
//    //  0 -> XMM1[127:96]
//
//    // 0 -> XMM0
//
//  YMM1 <-  RAX SCALE1* YMM2 0 [R+S*YMMR+N]  YMM0  VPGATHERDD,
//   // if YMM0[31] is 1 then   
//   //  [RAX + YMM2[31:0]][31:0] -> YMM1[31:0]
//   // else
//   //  0 -> YMM1[31:0]
//
//   // if YMM0[63] is 1 then   
//   //  [RAX + YMM2[63:32]][63:32] -> YMM1[63:32]
//   // else
//   //  0 -> YMM1[63:32]
//
//   // if YMM0[95] is 1 then   
//   //  [RAX + YMM2[95:64]][95:64] -> YMM1[95:64]
//   // else
//   //  0 -> YMM1[95:64]
// 
//   // ...
// 
//   // if YMM0[255] is 1 then   
//   //  [RAX + YMM2[255:224]][255:224] -> YMM1[255:224]
//   // else
//   //  0 -> YMM1[255:224]
//
//   // 0 -> YMM0
//
// Note:
//  The source target must be a vsib memory target.
//  The destination target can be either an xmm or ymm register and determines
//   whether 4 or 8 32 bit values are copied if their corresponding mask bits 
//   are set.
//  The mask register must be an xmm or ymm register and should be the same
//   kind of register as the source, but if it isn't, the mask register's size
//   is ignored.
//  The index register is supposed to be a ymm register but its size is also
//   ignored, so an XMM register will work.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpgatherdqcomma ( VPGATHERDQ, )
//
// C prototype:
//  void dg_forthvpgatherdqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetxparameterlist or targetyparameterlist can contain 
//   these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetyparameterlist or targetzparameterlist can
//   contain this addressing mode specifier:
//
//   baseregister scale indexregister displacement [R+S*YMMR+N]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister scale indexregister displacement minimumdisplacementsize [VSIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   [R+S*YMMR+N]                 specifies memory targets at the addresses at
//                                 baseregister + 
//                                 (scale*(indexregister[31:0]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[63:32]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[95:64]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[127:96]) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [VSIB]                       specifies a memory target at the address at
//                                 baseregister + 
//                                 (scale*(indexregister[31:0]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[63:32]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[95:64]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[127:96]) +
//                                 displacement (displacement is signed)
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPGATHERDQ instruction. This opcode sequence copies each of 2 or 4
//   64 bit values from the sources if the high bit of
//   corresponding 64 bit value in the mask is set and puts the values into the
//   destination.  If the high bit of the corresponding 64 bit value in the mask
//   is clear, the corresponding 64 bit value in the destination is left
//   unchanged. This opcode sequence partitions the lower 128 bits of the ymmr  
//   index register into 4 separate 32 bit signed indexes to generate 4
//   different source addresses. 
//   This opcode sequence also clears the mask register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  XMM0  RAX SCALE1* YMM2 0 [R+S*YMMR+N]  XMM1  VPGATHERDQ,
//   // if XMM0[63] is 1 then   
//   //  [RAX + YMM2[31:0]][63:0] -> XMM1[63:0]
//   // else
//   //  0 -> XMM1[63:0]
//
//   // if XMM0[127] is 1 then   
//   //  [RAX + YMM2[63:32]][127:64] -> XMM1[127:64]
//   // else
//   //  0 -> XMM1[127:64]
//
//   // 0 -> XMM0
//
//  XMM0  RAX SCALE2* YMM2 -8 [R+S*YMMR+N]  XMM1  VPGATHERDQ,
//   // if XMM0[63] is 1 then   
//   //  [RAX + (2*YMM2[31:0]) - 8][63:0] -> XMM1[63:0]
//   // else
//   //  0 -> XMM1[63:0]
//   
//   // if XMM0[127] is 1 then   
//   //  [RAX + (2*YMM2[63:32]) - 8][127:64] -> XMM1[127:64]
//   // else
//   //  0 -> XMM1[127:64]
//
//   // 0 -> XMM0
//
//  YMM1 <-  RAX SCALE1* YMM2 0 [R+S*YMMR+N]  YMM0  VPGATHERDQ,
//   // if YMM0[63] is 1 then   
//   //  [RAX + YMM2[31:0]][63:0] -> YMM1[63:0]
//   // else
//   //  0 -> YMM1[63:0]
//
//   // if YMM0[127] is 1 then   
//   //  [RAX + YMM2[63:32]][127:64] -> YMM1[127:64]
//   // else
//   //  0 -> YMM1[127:64]
//
//   // if YMM0[191] is 1 then   
//   //  [RAX + YMM2[95:64]][191:128] -> YMM1[191:128]
//   // else
//   //  0 -> YMM1[191:128]
//
//   // if YMM0[255] is 1 then   
//   //  [RAX + YMM2[127:96]][255:192] -> YMM1[255:192]
//   // else
//   //  0 -> YMM1[255:192]
//
//   // 0 -> YMM0
//
// Note:
//  The source target must be a vsib memory target.
//  The destination target can be either an xmm or ymm register and determines
//   whether 2 or 4 64 bit values are copied if their corresponding mask bits 
//   are set.
//  The mask register must be an xmm or ymm register and should be the same
//   kind of register as the source, but if it isn't, the mask register's size
//   is ignored.
//  The index register is supposed to be a ymm register but its size is also
//   ignored, so an XMM register will work.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
///////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpgatherqdcomma ( VPGATHERQD, )
//
// C prototype:
//  void dg_forthvpgatherqdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetxparameterlist or targetyparameterlist can contain 
//   these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetyparameterlist or targetzparameterlist can
//   contain this addressing mode specifier:
//
//   baseregister scale indexregister displacement [R+S*YMMR+N]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister scale indexregister displacement minimumdisplacementsize [VSIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   [R+S*YMMR+N]                 specifies memory targets at the addresses at
//                                 baseregister + 
//                                 (scale*(indexregister[63:0]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[127:64]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[191:128]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[255:192]) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [VSIB]                       specifies a memory target at the address at
//                                 baseregister + 
//                                 (scale*(indexregister[63:0]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[127:64]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[191:128]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[255:192]) +
//                                 displacement (displacement is signed)
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPGATHERQD instruction. This opcode sequence copies each of 2 or 4
//   32 bit values from the sources if the high bit of corresponding 32 bit
//   value in the mask is set and puts the values into the destination. If the 
//   high bit of the corresponding 32 bit value in the mask is clear, the 
//   corresponding 32 bit value in the destination is left unchanged. 
//   This opcode sequence also clears the mask register. This opcode sequence 
//   partitions the ymmr index register into 4 separate 64 bit signed indexes 
//   to generate 4 different source addresses.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  XMM0  RAX SCALE1* YMM2 0 [R+S*YMMR+N]  XMM1  VPGATHERQD,
//   // if XMM0[31] is 1 then   
//   //  [RAX + YMM2[63:0]][31:0] -> XMM1[31:0]
//   // else
//   //  0 -> XMM1[31:0]
//
//   // if XMM0[63] is 1 then   
//   //  [RAX + YMM2[127:64]][63:32] -> XMM1[63:32]
//   // else
//   //  0 -> XMM1[63:32]
//
//   // 0 -> XMM0
//
//  XMM0  RAX SCALE2* YMM2 -8 [R+S*YMMR+N]  XMM1  VPGATHERQD,
//   // if XMM0[31] is 1 then   
//   //  [RAX + (2*YMM2[63:0]) - 8][31:0] -> XMM1[31:0]
//   // else
//   //  0 -> XMM1[31:0]
//
//    // if XMM0[63] is 1 then   
//    //  [RAX + (2*YMM2[127:64]) - 8][63:32] -> XMM1[63:32]
//    // else
//    //  0 -> XMM1[63:32]
//
//    // 0 -> XMM0
//
//  YMM1 <-  RAX SCALE1* YMM2 0 [R+S*YMMR+N]  YMM0  VPGATHERQD,
//   // if YMM0[31] is 1 then   
//   //  [RAX + YMM2[63:0]][31:0] -> YMM1[31:0]
//   // else
//   //  0 -> YMM1[31:0]
//
//   // if YMM0[63] is 1 then   
//   //  [RAX + YMM2[127:64]][63:32] -> YMM1[63:32]
//   // else
//   //  0 -> YMM1[63:32]
//
//   // if YMM0[95] is 1 then   
//   //  [RAX + YMM2[191:128]][95:64] -> YMM1[95:64]
//   // else
//   //  0 -> YMM1[95:64]
//
//   // if YMM0[127] is 1 then   
//   //  [RAX + YMM2[255:192]][127:96] -> YMM1[127:96]
//   // else
//   //  0 -> YMM1[127:96]
//
//   // 0 -> YMM0
//
// Note:
//  The source target must be a vsib memory target.
//  The destination target can be either an xmm or ymm register and determines
//   whether 2 or 4 32 bit values are copied if their corresponding mask bits 
//   are set.
//  The mask register must be an xmm or ymm register and should be the same
//   kind of register as the source, but if it isn't, the mask register's size
//   is ignored.
//  The index register is supposed to be a ymm register but its size is also
//   ignored, so an XMM register will work.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpgatherqqcomma ( VPGATHERQQ, )
//
// C prototype:
//  void dg_forthvpgatherqqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetxparameterlist or targetyparameterlist can contain 
//   these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetyparameterlist or targetzparameterlist can
//   contain this addressing mode specifier:
//
//   baseregister scale indexregister displacement [R+S*YMMR+N]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister scale indexregister displacement minimumdisplacementsize [VSIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   [R+S*YMMR+N]                 specifies memory targets at the addresses at
//                                 baseregister + 
//                                 (scale*(indexregister[63:0]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[127:64]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[191:128]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[255:192]) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [VSIB]                       specifies a memory target at the address at
//                                 baseregister + 
//                                 (scale*(indexregister[63:0]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[127:64]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[191:128]) +
//                                 displacement (displacement is signed)
//                                 baseregister + 
//                                 (scale*(indexregister[255:192]) +
//                                 displacement (displacement is signed)
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPGATHERQQ instruction. This opcode sequence copies each of 2 or 4
//   64 bit values from the sources if the high bit of
//   corresponding 64 bit value in the mask is set and puts the values into the
//   destination.  If the high bit of the corresponding 64 bit value in the mask
//   is clear, the corresponding 64 bit value in the destination is left
//   unchanged. This opcode sequence partitions the ymmr index register into 4 
//   separate 64 bit signed indexes to generate 4 different source addresses. 
//   This opcode sequence also clears the mask register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  XMM0  RAX SCALE1* YMM2 0 [R+S*YMMR+N]  XMM1  VPGATHERQQ,
//   // if XMM0[63] is 1 then   
//   //  [RAX + YMM2[63:0]][63:0] -> XMM1[63:0]
//   // else
//   //  0 -> XMM1[63:0]
//
//   // if XMM0[127] is 1 then   
//   //  [RAX + YMM2[127:64]][127:64] -> XMM1[127:64]
//   // else
//   //  0 -> XMM1[127:64]
//
//   // 0 -> XMM0
//
//  XMM0  RAX SCALE2* YMM2 -8 [R+S*YMMR+N]  XMM1  VPGATHERQQ,
//   // if XMM0[63] is 1 then   
//   //  [RAX + (2*YMM2[63:0]) - 8][63:0] -> XMM1[63:0]
//   // else
//   //  0 -> XMM1[63:0]
//   
//   // if XMM0[127] is 1 then   
//   //  [RAX + (2*YMM2[127:64]) - 8][127:64] -> XMM1[127:64]
//   // else
//   //  0 -> XMM1[127:64]
//
//   // 0 -> XMM0
//
//  YMM1 <-  RAX SCALE1* YMM2 0 [R+S*YMMR+N]  YMM0  VPGATHERQQ,
//   // if YMM0[63] is 1 then   
//   //  [RAX + YMM2[63:0]][63:0] -> YMM1[63:0]
//   // else
//   //  0 -> YMM1[63:0]
//
//   // if YMM0[127] is 1 then   
//   //  [RAX + YMM2[127:64]][127:64] -> YMM1[127:64]
//   // else
//   //  0 -> YMM1[127:64]
//
//   // if YMM0[191] is 1 then   
//   //  [RAX + YMM2[191:128]][191:128] -> YMM1[191:128]
//   // else
//   //  0 -> YMM1[191:128]
//
//   // if YMM0[255] is 1 then   
//   //  [RAX + YMM2[255:192]][255:192] -> YMM1[255:192]
//   // else
//   //  0 -> YMM1[255:192]
//
//   // 0 -> YMM0
//
// Note:
//  The source target must be a vsib memory target.
//  The destination target can be either an xmm or ymm register and determines
//   whether 2 or 4 64 bit values are copied if their corresponding mask bits 
//   are set.
//  The mask register must be an xmm or ymm register and should be the same
//   kind of register as the source, but if it isn't, the mask register's size
//   is ignored.
//  The index register is supposed to be a ymm register but its size is also
//   ignored, so an XMM register will work.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvinsertf128comma ( VINSERTF128, )
//
// C prototype:
//  void dg_forthvinsertf128comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter lists for targets x and y can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VINSERTF128 instruction. This opcode sequence copies a 128 bit value
//   from xmm register or memory source and puts it into either the lower or
//   upper 128 bits of the destination ymm register depending on whether bit 0 
//   of the immediate target is a 0 or 1.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  YMM0  VINSERTF128,     // [RBX][127:0]   -> YMM0[127:0]
//
//  01 N  RBX [R]  YMM0  VINSERTF128,     // [RBX][127:0]   -> YMM0[255:128]
//
//  01 N  YMM0 <-  XMM2   VINSERTF128, // XMM2[127:0]    -> YMM0[255:128]
//
//  01 N  XMM8  YMM0  VINSERTF128,        // XMM8[127:0]    -> YMM0[255:128]
//
// Note:
//  Only 1 target can be a memory target. The source target must be an xmm
//   register or memory. The first target must be the immediate target. If you use
//   -> it must come after a memory, xmm or ymm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0. The destination must be a ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvinserti128comma ( VINSERTI128, )
//
// C prototype:
//  void dg_forthvinserti128comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetwparameterlist targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetwparameterlist
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for target w can contain these addressing mode
//   specifiers:
//
//   immediatevalue N
//   immediatevalue 1 IMMEDIATE
//
//  The parameter list for targets x and y can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter lists for targets x and y can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   immediatevalue               a number from 0 to 255
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//
//   N                            specifies an immediate target. The value
//                                 for this target is encoded into the
//                                 opcode sequence using the smallest size
//                                 possible. (Byte only for this instruction.)
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   IMMEDIATE                    specifies an immediate target.
//                                 The value for this target is encoded into
//                                 the opcode sequence.
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VINSERTI128 instruction. This opcode sequence copies a 128 bit value
//   from xmm register or memory source and puts it into either the lower or
//   upper 128 bits of the destination ymm register depending on whether bit 0 
//   of the immediate target is a 0 or 1.
//
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  00 N  RBX [R]  YMM0  VINSERTI128,     // [RBX][127:0]   -> YMM0[127:0]
//
//  01 N  RBX [R]  YMM0  VINSERTI128,     // [RBX][127:0]   -> YMM0[255:128]
//
//  01 N  YMM0 <-  XMM2   VINSERTI128, // XMM2[127:0]    -> YMM0[255:128]
//
//  01 N  XMM8  YMM0  VINSERTI128,        // XMM8[127:0]    -> YMM0[255:128]
//
// Note:
//  Only 1 target can be a memory target. The source target must be an xmm
//   register or memory. The first target must be the immediate target. If you use
//   -> it must come after a memory, xmm or ymm target. The immediate target's
//   size is one byte, so if you use IMMEDIATE to specify a minimum size, it
//   must be 1 or 0. The destination must be a ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpbroadcastbcomma ( VPBROADCASTB, )
//
// C prototype:
//  void dg_forthvpbroadcastbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VPBROADCASTB instruction. This opcode sequence copies a byte from
//   the xmm register, ymm register, or memory target source and puts a copy 
//   of that byte into each byte of the xmm or ymm register destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VPBROADCASTB,  // [RBX][7:0] -> XMM0[7:0]
//                                // [RBX][7:0] -> XMM0[15:8]
//                                // ...
//                                // [RBX][7:0] -> XMM0[127:120]
//
//  XMM2  XMM0  VPBROADCASTB,     // XMM2[7:0] -> XMM0[7:0]
//                                // XMM2[7:0] -> XMM0[15:8]
//                                // ...
//                                // XMM2[7:0] -> XMM0[127:120]
//
//  XMM0 <- XMM2  VPBROADCASTB, // XMM2[7:0] -> XMM0[7:0]
//                                 // XMM2[7:0] -> XMM0[15:8]
//                                 // ...
//                                 // XMM2[7:0] -> XMM0[127:120]
//
//  YMM2  YMM0  VPBROADCASTB,     // YMM2[7:0] -> YMM0[7:0]
//                                // YMM2[7:0] -> YMM0[15:8]
//                                // ...
//                                // YMM2[7:0] -> YMM0[255:248]
//
//
// Note:
//  Only 1 target can be a memory target. The destination must be an xmm or ymm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpbroadcastwcomma ( VPBROADCASTW, )
//
// C prototype:
//  void dg_forthvpbroadcastwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VPBROADCASTW instruction. This opcode sequence copies a 16 bit
//   value from the xmm register, ymm register, or memory target source and 
//   puts a copy of that 16 bit value into each 16 bit section of the xmm or
//   ymm register destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VPBROADCASTW,  // [RBX][15:0] -> XMM0[15:0]
//                                // [RBX][15:0] -> XMM0[31:16]
//                                // ...
//                                // [RBX][15:0] -> XMM0[127:112]
//
//  XMM2  XMM0  VPBROADCASTW,     // XMM2[15:0] -> XMM0[15:0]
//                                // XMM2[15:0] -> XMM0[31:16]
//                                // ...
//                                // XMM2[15:0] -> XMM0[127:112]
//
//  XMM0 <- XMM2  VPBROADCASTW, // XMM2[15:0] -> XMM0[15:0]
//                                 // XMM2[15:0] -> XMM0[31:16]
//                                 // ...
//                                 // XMM2[15:0] -> XMM0[127:112]
//
//  YMM2  YMM0  VPBROADCASTW,     // YMM2[15:0] -> YMM0[15:0]
//                                // YMM2[15:0] -> YMM0[31:16]
//                                // ...
//                                // YMM2[15:0] -> YMM0[255:240]
//
//
// Note:
//  Only 1 target can be a memory target. The destination must be an xmm or ymm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpbroadcastdcomma ( VPBROADCASTD, )
//
// C prototype:
//  void dg_forthvpbroadcastdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VPBROADCASTD instruction. This opcode sequence copies a 32 bit
//   value from the xmm register, ymm register, or memory target source and 
//   puts a copy of that 32 bit value into each 32 bit section of the xmm or
//   ymm register destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VPBROADCASTD,  // [RBX][31:0] -> XMM0[31:0]
//                                // [RBX][31:0] -> XMM0[63:32]
//                                // ...
//                                // [RBX][31:0] -> XMM0[127:96]
//
//  XMM2  XMM0  VPBROADCASTD,     // XMM2[31:0] -> XMM0[31:0]
//                                // XMM2[31:0] -> XMM0[63:32]
//                                // ...
//                                // XMM2[31:0] -> XMM0[127:96]
//
//  XMM0 <- XMM2  VPBROADCASTD, // XMM2[31:0] -> XMM0[31:0]
//                                 // XMM2[31:0] -> XMM0[63:32]
//                                 // ...
//                                 // XMM2[31:0] -> XMM0[127:96]
//
//  YMM2  YMM0  VPBROADCASTD,     // YMM2[31:0] -> YMM0[31:0]
//                                // YMM2[31:0] -> YMM0[63:32]
//                                // ...
//                                // YMM2[31:0] -> YMM0[255:224]
//
//
// Note:
//  Only 1 target can be a memory target. The destination must be an xmm or ymm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpbroadcastqcomma ( VPBROADCASTQ, )
//
// C prototype:
//  void dg_forthvpbroadcastqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 Push this after the complete parameter list
//                                 of either target; it can not come in the
//                                 middle of a target's addressing mode
//                                 parameters.
//   <-                           sets the direction to reverse.
//                                 Push this after the complete parameter list
//                                 of either target; it can not come in the
//                                 middle of a target's addressing mode
//                                 parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional but can force the use of
//                                 3 byte vex encoding.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VPBROADCASTQ instruction. This opcode sequence copies a 64 bit
//   value from the xmm register, ymm register, or memory target source and 
//   puts a copy of that 64 bit value into each 64 bit section of the xmm or
//   ymm register destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VPBROADCASTQ,  // [RBX][63:0] -> XMM0[63:0]
//                                // [RBX][63:0] -> XMM0[127:64]
//
//  XMM2  XMM0  VPBROADCASTQ,     // XMM2[63:0] -> XMM0[63:0]
//                                // XMM2[63:0] -> XMM0[127:64]
//
//  XMM0 <- XMM2  VPBROADCASTQ, // XMM2[63:0] -> XMM0[63:0]
//                                 // XMM2[63:0] -> XMM0[127:64]
//
//  YMM2  YMM0  VPBROADCASTQ,     // YMM2[63:0] -> YMM0[63:0]
//                                // YMM2[63:0] -> YMM0[127:64]
//                                // ...
//                                // YMM2[63:0] -> YMM0[255:192]
//
//
// Note:
//  Only 1 target can be a memory target. The destination must be an xmm or ymm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpbroadcasti128comma ( VPBROADCASTI128, )
//
// C prototype:
//  void dg_forthvpbroadcasti128comma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 Push this after the complete parameter list
//                                 of either target; it can not come in the
//                                 middle of a target's addressing mode
//                                 parameters.
//   <-                           sets the direction to reverse.
//                                 Push this after the complete parameter list
//                                 of either target; it can not come in the
//                                 middle of a target's addressing mode
//                                 parameters.
//
//   YMMR                         specifies a ymmr register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VPBROADCASTI128 instruction. This opcode sequence copies a 128 bit
//   value from the memory target source and puts a copy of that 128 bit value
//   into each 128 bit section of the ymm register destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  YMM0  VPBROADCASTI128,  // [RBX][127:0] -> YMM0[127:0]
//                                   // [RBX][127:0] -> YMM0[255:128]
//
//  YMM0 <- RBX [R]  VPBROADCASTI128, // [RBX][127:0] -> YMM0[127:0]
//                                       // [RBX][127:0] -> YMM0[255:128]
//
//
// Note:
//  The source must be a memory target. The destination must be a ymm
//   register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpcmpeqbcomma ( VPCMPEQB, )
//
// C prototype:
//  void dg_forthvpcmpeqbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 Push this after the complete parameter list
//                                 of either target; it can not come in the
//                                 middle of a target's addressing mode
//                                 parameters.
//   <-                           sets the direction to reverse.
//                                 Push this after the complete parameter list
//                                 of either target; it can not come in the
//                                 middle of a target's addressing mode
//                                 parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPCMPEQB instruction. This opcode sequence compares each byte in
//   target y with the corresponding byte in the source and if they are equal it
//   sets all the bits in the corresponding byte in the destination. If the byte
//   from target y does not equal the byte in the source, the corresponding byte
//   in the destination is cleared.
//   
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VPCMPEQB,
//   if (XMM1[7:0] == [RBX][7:0]) then 
//    FF -> XMM0[7:0]
//   else  
//    0 -> XMM0[7:0]
//   if (XMM1[15:8] == [RBX][15:8]) then 
//    FF -> XMM0[15:8]
//   else  
//    0 -> XMM0[15:8]
//   ...
//   if (XMM1[127:120] == [RBX][127:120]) then 
//    FF -> XMM0[127:120]
//   else  
//    0 -> XMM0[127:120]
//
//  YMM2  YMM1  YMM0  VPCMPEQB,
//   if (YMM1[7:0] == YMM2[7:0]) then 
//    FF -> YMM0[7:0]
//   else  
//    0 -> YMM0[7:0]
//   if (YMM1[15:8] == YMM2[15:8]) then 
//    FF -> YMM0[15:8]
//   else  
//    0 -> YMM0[15:8]
//   ...
//   if (YMM1[255:248] == YMM2[255:248]) then 
//    FF -> YMM0[255:248]
//   else  
//    0 -> YMM0[255:248]
//
// Note:
//  Only the source can be a memory target. The destination target and target y
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpcmpeqwcomma ( VPCMPEQW, )
//
// C prototype:
//  void dg_forthvpcmpeqwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 Push this after the complete parameter list
//                                 of either target; it can not come in the
//                                 middle of a target's addressing mode
//                                 parameters.
//   <-                           sets the direction to reverse.
//                                 Push this after the complete parameter list
//                                 of either target; it can not come in the
//                                 middle of a target's addressing mode
//                                 parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPCMPEQW instruction. This opcode sequence compares each 16 bit value 
//   in target y with the corresponding 16 bit value in the source and if they 
//   are equal it sets all the bits in the corresponding 16 bit value in the 
//   destination. If the 16 bit value from target y does not equal the 16 bit
//   value in the source, the corresponding 16 bit value in the destination is 
//   cleared.
//   
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VPCMPEQW,
//   if (XMM1[15:0] == [RBX][15:0]) then 
//    FFFF -> XMM0[15:0]
//   else  
//    0 -> XMM0[15:0]
//   if (XMM1[31:16] == [RBX][31:16]) then 
//    FFFF -> XMM0[31:16]
//   else  
//    0 -> XMM0[31:16]
//   ...
//   if (XMM1[127:112] == [RBX][127:112]) then 
//    FFFF -> XMM0[127:112]
//   else  
//    0 -> XMM0[127:112]
//
//  YMM2  YMM1  YMM0  VPCMPEQW,
//   if (YMM1[15:0] == YMM2[15:0]) then 
//    FFFF -> YMM0[15:0]
//   else  
//    0 -> YMM0[15:0]
//   if (YMM1[31:16] == YMM2[31:16]) then 
//    FFFF -> YMM0[31:16]
//   else  
//    0 -> YMM0[31:16]
//   ...
//   if (YMM1[255:240] == YMM2[255:240]) then 
//    FFFF -> YMM0[255:240]
//   else  
//    0 -> YMM0[255:240]
//
// Note:
//  Only the source can be a memory target. The destination target and target y
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpcmpeqdcomma ( VPCMPEQD, )
//
// C prototype:
//  void dg_forthvpcmpeqdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                           sets the direction to forward.
//                                 This is the default value.
//                                 Push this after the complete parameter list
//                                 of either target; it can not come in the
//                                 middle of a target's addressing mode
//                                 parameters.
//   <-                           sets the direction to reverse.
//                                 Push this after the complete parameter list
//                                 of either target; it can not come in the
//                                 middle of a target's addressing mode
//                                 parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPCMPEQD instruction. This opcode sequence compares each 32 bit value 
//   in target y with the corresponding 32 bit value in the source and if they 
//   are equal it sets all the bits in the corresponding 32 bit value in the 
//   destination. If the 32 bit value from target y does not equal the 32 bit
//   value in the source, the corresponding 32 bit value in the destination is 
//   cleared.
//   
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VPCMPEQD,
//   if (XMM1[31:0] == [RBX][31:0]) then 
//    FFFFFFFF -> XMM0[31:0]
//   else  
//    0 -> XMM0[31:0]
//   if (XMM1[63:32] == [RBX][63:32]) then 
//    FFFFFFFF -> XMM0[63:32]
//   else  
//    0 -> XMM0[63:32]
//   ...
//   if (XMM1[127:96] == [RBX][127:96]) then 
//    FFFFFFFF -> XMM0[127:96]
//   else  
//    0 -> XMM0[127:96]
//
//  YMM2  YMM1  YMM0  VPCMPEQD
//   if (YMM1[31:0] == YMM2[31:0]) then 
//    FFFFFFFF -> YMM0[31:0]
//   else  
//    0 -> YMM0[31:0]
//   if (YMM1[63:32] == YMM2[63:32]) then 
//    FFFFFFFF -> YMM0[63:32]
//   else  
//    0 -> YMM0[63:32]
//   ...
//   if (YMM1[255:224] == YMM2[255:224]) then 
//    FFFFFFFF -> YMM0[255:224]
//   else  
//    0 -> YMM0[255:224]
//
// Note:
//  Only the source can be a memory target. The destination target and target y
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpcmpeqqcomma ( VPCMPEQQ, )
//
// C prototype:
//  void dg_forthvpcmpeqqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing
//   mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPCMPEQQ instruction. This opcode sequence compares each 64 bit value
//   in target y with the corresponding 64 bit value in the source and if they
//   are equal it sets all the bits in the corresponding 64 bit value in the
//   destination. If the 64 bit value from target y does not equal the 64 bit
//   value in the source, the corresponding 64 bit value in the destination is
//   cleared.
//   
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VPCMPEQQ
//   if (XMM1[63:0] == [RBX][63:0]) then 
//    FFFFFFFFFFFFFFFF -> XMM0[63:0]
//   else  
//    0 -> XMM0[63:0]
//   if (XMM1[127:64] == [RBX][127:64]) then 
//    FFFFFFFFFFFFFFFF -> XMM0[127:64]
//   else  
//    0 -> XMM0[127:64]
//
//  YMM2  YMM1  YMM0  VPCMPEQQ
//   if (YMM1[63:0] == YMM2[63:0]) then 
//    FFFFFFFFFFFFFFFF -> YMM0[63:0]
//   else  
//    0 -> YMM0[63:0]
//   if (YMM1[127:64] == YMM2[127:64]) then 
//    FFFFFFFFFFFFFFFF -> YMM0[127:64]
//   else  
//    0 -> YMM0[127:64]
//   ...
//   if (YMM1[255:192] == YMM2[255:192]) then
//    FFFFFFFFFFFFFFFF -> YMM0[255:192]
//   else
//    0 -> YMM0[255:192]
//
// Note:
//  Only the source can be a memory target. The destination target and target y
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpcmpgtbcomma ( VPCMPGTB, )
//
// C prototype:
//  void dg_forthvpcmpgtbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing
//   mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPCMPGTB instruction. This opcode sequence compares each signed byte
//   in target y with the corresponding signed byte in the source and if the
//   byte from target y is greater it sets all the bits in the corresponding
//   byte in the destination. If the byte from target y is not greater than the
//   byte in the source, the corresponding byte in the destination is cleared.
//   
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VPCMPGTB
//   if (XMM1[7:0] > [RBX][7:0]) then 
//    FF -> XMM0[7:0]
//   else  
//    0 -> XMM0[7:0]
//   if (XMM1[15:8] > [RBX][15:8]) then 
//    FF -> XMM0[15:8]
//   else  
//    0 -> XMM0[15:8]
//   ...
//   if (XMM1[127:120] > [RBX][127:120]) then 
//    FF -> XMM0[127:120]
//   else  
//    0 -> XMM0[127:120]
//
//  YMM2  YMM1  YMM0  VPCMPGTB
//   if (YMM1[7:0] > YMM2[7:0]) then 
//    FF -> YMM0[7:0]
//   else  
//    0 -> YMM0[7:0]
//   if (YMM1[15:8] > YMM2[15:8]) then 
//    FF -> YMM0[15:8]
//   else  
//    0 -> YMM0[15:8]
//   ...
//   if (YMM1[255:248] > YMM2[255:248]) then 
//    FF -> YMM0[255:248]
//   else  
//    0 -> YMM0[255:248]
//
// Note:
//  Only the source can be a memory target. The destination target and target y
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpcmpgtwcomma ( VPCMPGTW, )
//
// C prototype:
//  void dg_forthvpcmpgtwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing
//   mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPCMPGTW instruction. This opcode sequence compares each signed 16
//   bit value in target y with the corresponding signed 16 bit value in the
//   source and if the value from target y is greater it sets all the bits in
//   the corresponding 16 bit value in the destination. If the 16 bit value from
//   target y is not greater than the 16 bit value in the source, the
//   corresponding 16 bit value in the destination is cleared.
//   
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VPCMPGTW
//   if (XMM1[15:0] > [RBX][15:0]) then 
//    FFFF -> XMM0[15:0]
//   else  
//    0 -> XMM0[15:0]
//   if (XMM1[31:16] > [RBX][31:16]) then 
//    FFFF -> XMM0[31:16]
//   else  
//    0 -> XMM0[31:16]
//   ...
//   if (XMM1[127:112] > [RBX][127:112]) then 
//    FFFF -> XMM0[127:112]
//   else  
//    0 -> XMM0[127:112]
//
//  YMM2  YMM1  YMM0  VPCMPGTW
//   if (YMM1[15:0] > YMM2[15:0]) then 
//    FFFF -> YMM0[15:0]
//   else  
//    0 -> YMM0[15:0]
//   if (YMM1[31:16] > YMM2[31:16]) then 
//    FFFF -> YMM0[31:16]
//   else  
//    0 -> YMM0[31:16]
//   ...
//   if (YMM1[255:240] > YMM2[255:240]) then 
//    FFFF -> YMM0[255:240]
//   else  
//    0 -> YMM0[255:240]
//
// Note:
//  Only the source can be a memory target. The destination target and target y
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpcmpgtdcomma ( VPCMPGTD, )
//
// C prototype:
//  void dg_forthvpcmpgtdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing
//   mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPCMPGTD instruction. This opcode sequence compares each signed 32
//   bit value in target y with the corresponding signed 32 bit value in the
//   source and if the value from target y is greater it sets all the bits in
//   the corresponding 32 bit value in the destination. If the 32 bit value from
//   target y is not greater than the 32 bit value in the source, the
//   corresponding 32 bit value in the destination is cleared.
//   
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VPCMPGTD
//   if (XMM1[31:0] > [RBX][31:0]) then 
//    FFFFFFFF -> XMM0[31:0]
//   else  
//    0 -> XMM0[31:0]
//   if (XMM1[63:32] > [RBX][63:32]) then 
//    FFFFFFFF -> XMM0[63:32]
//   else  
//    0 -> XMM0[63:32]
//   ...
//   if (XMM1[127:96] > [RBX][127:96]) then 
//    FFFFFFFF -> XMM0[127:96]
//   else  
//    0 -> XMM0[127:96]
//
//  YMM2  YMM1  YMM0  VPCMPGTD
//   if (YMM1[31:0] > YMM2[31:0]) then 
//    FFFFFFFF -> YMM0[31:0]
//   else  
//    0 -> YMM0[31:0]
//   if (YMM1[63:32] > YMM2[63:32]) then 
//    FFFFFFFF -> YMM0[63:32]
//   else  
//    0 -> YMM0[63:32]
//   ...
//   if (YMM1[255:224] > YMM2[255:224]) then 
//    FFFFFFFF -> YMM0[255:224]
//   else  
//    0 -> YMM0[255:224]
//
// Note:
//  Only the source can be a memory target. The destination target and target y
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpcmpgtqcomma ( VPCMPGTQ, )
//
// C prototype:
//  void dg_forthvpcmpgtqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPCMPGTQ instruction. This opcode sequence compares each signed 64
//   bit value in target y with the corresponding signed 64 bit value in the
//   source and, if the value from target y is greater, sets all the bits in the
//   corresponding 64 bit value in the destination. If the 64 bit value from
//   target y is not greater than the 64 bit value in the source, the
//   corresponding 64 bit value in the destination is cleared.
//   
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM1  XMM0  VPCMPGTQ,
//   if (XMM1[63:0] > [RBX][63:0]) then 
//    FFFFFFFFFFFFFFFF -> XMM0[63:0]
//   else  
//    0 -> XMM0[63:0]
//   if (XMM1[127:64] > [RBX][127:64]) then 
//    FFFFFFFFFFFFFFFF -> XMM0[127:64]
//   else  
//    0 -> XMM0[127:64]
//
//  YMM2  YMM1  YMM0  VPCMPGTQ,
//   if (YMM1[63:0] > YMM2[63:0]) then 
//    FFFFFFFFFFFFFFFF -> YMM0[63:0]
//   else  
//    0 -> YMM0[63:0]
//   if (YMM1[127:64] > YMM2[127:64]) then 
//    FFFFFFFFFFFFFFFF -> YMM0[127:64]
//   else  
//    0 -> YMM0[127:64]
//   ...
//   if (YMM1[255:192] > YMM2[255:192]) then
//    FFFFFFFFFFFFFFFF -> YMM0[255:192]
//   else
//    0 -> YMM0[255:192]
//
// Note:
//  Only the source can be a memory target. The destination target and target y
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpsllvdcomma ( VPSLLVD, )
//
// C prototype:
//  void dg_forthvpsllvdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSLLVD instruction. The sequence shifts each 32 bit value in 
//   target y left logically by the number of bits in each 32 bit section of the
//   source and puts the results into the destination. Logically shifting means
//   0s are shifted in from the right.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPSLLVD, 
//                            // XMM0[31:0] << [RBX][31:0]
//                            //  -> XMM1[31:0]
//                            // XMM0[63:32] << [RBX][63:32]
//                            //  -> XMM1[63:32]
//                            // XMM0[95:64] << [RBX][95:64]
//                            //  -> XMM1[95:64]
//                            // XMM0[127:96] << [RBX][127:96]
//                            //  -> XMM1[127:96]
//
//  YMM2  YMM0  YMM1  VPSLLVD,  
//                            // YMM0[31:0] <<   YMM2[31:0]
//                            //  -> YMM1[31:0]
//                            // YMM0[63:32] <<  YMM2[63:32]
//                            //  -> YMM1[63:32]
//                            // YMM0[95:64] <<  YMM2[95:64]
//                            //  -> YMM1[95:64]
//                            // YMM0[127:96] << YMM2[127:96]
//                            //  -> YMM1[127:96]
//                            // YMM0[159:128] <<  YMM2[159:128]
//                            //  -> YMM1[159:128]
//                            // YMM0[191:160] <<  YMM2[191:160]
//                            //  -> YMM1[191:160]
//                            // YMM0[223:192] <<  YMM2[223:192]
//                            //  -> YMM1[223:192]
//                            // YMM0[255:224] << YMM2[255:224]
//                            //  -> YMM1[255:224]
//
//  XMM1 <-  XMM0  XMM2  VPSLLVD, 
//                            // XMM0[31:0] << XMM2[31:0]
//                            //  -> XMM1[31:0]
//                            // XMM0[63:32] << XMM2[63:32]
//                            //  -> XMM1[63:32]
//                            // XMM0[95:64] << XMM2[95:64]
//                            //  -> XMM1[95:64]
//                            // XMM0[127:96] << XMM2[127:96]
//                            //  -> XMM1[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpsllvqcomma ( VPSLLVQ, )
//
// C prototype:
//  void dg_forthvpsllvqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSLLVQ instruction. The sequence shifts each 64 bit value in 
//   target y left logically by the number of bits in each 64 bit section of the
//   source and puts the results into the destination. Logically shifting means
//   0s are shifted in from the right.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPSLLVQ, 
//                            // XMM0[63:0] << [RBX][63:0]
//                            //  -> XMM1[63:0]
//                            // XMM0[127:64] << [RBX][127:64]
//                            //  -> XMM1[127:64]
//
//  YMM2  YMM0  YMM1  VPSLLVQ,  
//                            // YMM0[63:0] <<   YMM2[63:0]
//                            //  -> YMM1[63:0]
//                            // YMM0[127:64] <<  YMM2[127:64]
//                            //  -> YMM1[127:64]
//                            // YMM0[191:128] <<  YMM2[191:128]
//                            //  -> YMM1[191:128]
//                            // YMM0[255:192] << YMM2[255:192]
//                            //  -> YMM1[255:192]
//
//  XMM1 <-  XMM0  XMM2  VPSLLVQ, 
//                            // XMM0[63:0] << XMM2[63:0]
//                            //  -> XMM1[63:0]
//                            // XMM0[127:64] << XMM2[127:64]
//                            //  -> XMM1[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpsravdcomma ( VPSRAVD, )
//
// C prototype:
//  void dg_forthvpsravdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSRAVD instruction. The sequence shifts each 32 bit value in 
//   target y right arithmetically by the number of bits in each 32 bit section of the
//   source and puts the results into the destination. Arithmetically shifting
//   means copies of the high bit in each section are shifted in from the left.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPSRAVD, 
//                            // XMM0[31:0] >> [RBX][31:0]
//                            //  -> XMM1[31:0]
//                            // XMM0[63:32] >> [RBX][63:32]
//                            //  -> XMM1[63:32]
//                            // XMM0[95:64] >> [RBX][95:64]
//                            //  -> XMM1[95:64]
//                            // XMM0[127:96] >> [RBX][127:96]
//                            //  -> XMM1[127:96]
//
//  YMM2  YMM0  YMM1  VPSRAVD,  
//                            // YMM0[31:0] >>   YMM2[31:0]
//                            //  -> YMM1[31:0]
//                            // YMM0[63:32] >>  YMM2[63:32]
//                            //  -> YMM1[63:32]
//                            // YMM0[95:64] >>  YMM2[95:64]
//                            //  -> YMM1[95:64]
//                            // YMM0[127:96] >> YMM2[127:96]
//                            //  -> YMM1[127:96]
//                            // YMM0[159:128] >>  YMM2[159:128]
//                            //  -> YMM1[159:128]
//                            // YMM0[191:160] >>  YMM2[191:160]
//                            //  -> YMM1[191:160]
//                            // YMM0[223:192] >>  YMM2[223:192]
//                            //  -> YMM1[223:192]
//                            // YMM0[255:224] >> YMM2[255:224]
//                            //  -> YMM1[255:224]
//
//  XMM1 <-  XMM0  XMM2  VPSRAVD, 
//                            // XMM0[31:0] >> XMM2[31:0]
//                            //  -> XMM1[31:0]
//                            // XMM0[63:32] >> XMM2[63:32]
//                            //  -> XMM1[63:32]
//                            // XMM0[95:64] >> XMM2[95:64]
//                            //  -> XMM1[95:64]
//                            // XMM0[127:96] >> XMM2[127:96]
//                            //  -> XMM1[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpsrlvdcomma ( VPSRLVD, )
//
// C prototype:
//  void dg_forthvpsrlvdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSRLVD instruction. The sequence shifts each 32 bit value in 
//   target y right logically by the number of bits in each 32 bit section of the
//   source and puts the results into the destination. Logically shifting means
//   0s are shifted in from the left.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPSRLVD, 
//                            // XMM0[31:0] >> [RBX][31:0]
//                            //  -> XMM1[31:0]
//                            // XMM0[63:32] >> [RBX][63:32]
//                            //  -> XMM1[63:32]
//                            // XMM0[95:64] >> [RBX][95:64]
//                            //  -> XMM1[95:64]
//                            // XMM0[127:96] >> [RBX][127:96]
//                            //  -> XMM1[127:96]
//
//  YMM2  YMM0  YMM1  VPSRLVD,  
//                            // YMM0[31:0] >>   YMM2[31:0]
//                            //  -> YMM1[31:0]
//                            // YMM0[63:32] >>  YMM2[63:32]
//                            //  -> YMM1[63:32]
//                            // YMM0[95:64] >>  YMM2[95:64]
//                            //  -> YMM1[95:64]
//                            // YMM0[127:96] >> YMM2[127:96]
//                            //  -> YMM1[127:96]
//                            // YMM0[159:128] >>  YMM2[159:128]
//                            //  -> YMM1[159:128]
//                            // YMM0[191:160] >>  YMM2[191:160]
//                            //  -> YMM1[191:160]
//                            // YMM0[223:192] >>  YMM2[223:192]
//                            //  -> YMM1[223:192]
//                            // YMM0[255:224] >> YMM2[255:224]
//                            //  -> YMM1[255:224]
//
//  XMM1 <-  XMM0  XMM2  VPSRLVD, 
//                            // XMM0[31:0] >> XMM2[31:0]
//                            //  -> XMM1[31:0]
//                            // XMM0[63:32] >> XMM2[63:32]
//                            //  -> XMM1[63:32]
//                            // XMM0[95:64] >> XMM2[95:64]
//                            //  -> XMM1[95:64]
//                            // XMM0[127:96] >> XMM2[127:96]
//                            //  -> XMM1[127:96]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpsrlvqcomma ( VPSRLVQ, )
//
// C prototype:
//  void dg_forthvpsrlvqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSRLVQ instruction. The sequence shifts each 64 bit value in 
//   target y right logically by the number of bits in each 64 bit section of the
//   source and puts the results into the destination. Logically shifting means
//   0s are shifted in from the left.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPSRLVQ, 
//                            // XMM0[63:0] >> [RBX][63:0]
//                            //  -> XMM1[63:0]
//                            // XMM0[127:64] >> [RBX][127:64]
//                            //  -> XMM1[127:64]
//
//  YMM2  YMM0  YMM1  VPSRLVQ,  
//                            // YMM0[63:0] >>   YMM2[63:0]
//                            //  -> YMM1[63:0]
//                            // YMM0[127:64] >>  YMM2[127:64]
//                            //  -> YMM1[127:64]
//                            // YMM0[191:128] >>  YMM2[191:128]
//                            //  -> YMM1[191:128]
//                            // YMM0[255:192] >> YMM2[255:192]
//                            //  -> YMM1[255:192]
//
//  XMM1 <-  XMM0  XMM2  VPSRLVQ, 
//                            // XMM0[63:0] >> XMM2[63:0]
//                            //  -> XMM1[63:0]
//                            // XMM0[127:64] >> XMM2[127:64]
//                            //  -> XMM1[127:64]
//
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpsubbcomma ( VPSUBB, )
//
// C prototype:
//  void dg_forthvpsubbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSUBB instruction. This opcode sequence subtracts each byte value
//   in the source from each byte in target y and puts the results into the
//   destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPSUBB,    
//                     // XMM0[7:0]  - [RBX][7:0]  -> XMM1[7:0]
//                     // XMM0[15:8] - [RBX][15:8] -> XMM1[15:8]
//                     // ...
//                     // XMM0[127:120] - [RBX][127:120] -> XMM1[127:120]
//                     // 0 -> YMM1[255:128]
//  
//
//  YMM2  YMM0  YMM1  VPSUBB,       
//                     // YMM0[7:0]  - YMM2[7:0]  -> YMM1[7:0]
//                     // YMM0[15:8] - YMM2[15:8] -> YMM1[15:8]
//                     // ...
//                     // YMM0[255:248] - YMM2[255:248] -> YMM1[255:248]
//
//  XMM1 <-  XMM0  XMM2  VPSUBB, 
//                     // XMM0[7:0]  - XMM2[7:0]  -> XMM1[7:0]
//                     // XMM0[15:8] - XMM2[15:8] -> XMM1[15:8]
//                     // ...
//                     // XMM0[127:120] - XMM2[127:120] -> XMM1[127:120]
//                     // 0 -> YMM1[255:128]
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpsubwcomma ( VPSUBW, )
//
// C prototype:
//  void dg_forthvpsubwcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSUBW instruction. This opcode sequence subtracts each 16 bit value
//   in the source from each 16 bit value in target y and puts the results into
//   the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPSUBW,    
//                     // XMM0[15:0]  - [RBX][15:0]  -> XMM1[15:0]
//                     // XMM0[31:16] - [RBX][31:16] -> XMM1[31:16]
//                     // ...
//                     // XMM0[127:112] - [RBX][127:112] -> XMM1[127:112]
//                     // 0 -> YMM1[255:128]
//  
//
//  YMM2  YMM0  YMM1  VPSUBW,       
//                     // YMM0[15:0]  - YMM2[15:0]  -> YMM1[15:0]
//                     // YMM0[31:16] - YMM2[31:16] -> YMM1[31:16]
//                     // ...
//                     // YMM0[255:240] - YMM2[255:240] -> YMM1[255:240]
//
//  XMM1 <-  XMM0  XMM2  VPSUBW, 
//                     // XMM0[15:0]  - XMM2[15:0]  -> XMM1[15:0]
//                     // XMM0[31:16] - XMM2[31:16] -> XMM1[31:16]
//                     // ...
//                     // XMM0[127:112] - XMM2[127:112] -> XMM1[127:112]
//                     // 0 -> YMM1[255:128]
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpsubdcomma ( VPSUBD, )
//
// C prototype:
//  void dg_forthvpsubdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSUBD instruction. This opcode sequence subtracts each 32 bit value
//   in the source from the corresponding 32 bit value in target y and puts the
//   results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPSUBD,    
//                     // XMM0[31:0]  - [RBX][31:0]  -> XMM1[31:0]
//                     // XMM0[63:32] - [RBX][63:32] -> XMM1[63:32]
//                     // ...
//                     // XMM0[127:96] - [RBX][127:96] -> XMM1[127:96]
//                     // 0 -> YMM1[255:128]
//  
//
//  YMM2  YMM0  YMM1  VPSUBD,       
//                     // YMM0[31:0]  - YMM2[31:0]  -> YMM1[31:0]
//                     // YMM0[63:32] - YMM2[63:32] -> YMM1[63:32]
//                     // ...
//                     // YMM0[255:224] - YMM2[255:224] -> YMM1[255:224]
//
//  XMM1 <-  XMM0  XMM2  VPSUBD, 
//                     // XMM0[31:0]  - XMM2[31:0]  -> XMM1[31:0]
//                     // XMM0[63:32] - XMM2[63:32] -> XMM1[63:32]
//                     // ...
//                     // XMM0[127:96] - XMM2[127:96] -> XMM1[127:96]
//                     // 0 -> YMM1[255:128]
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpsubqcomma ( VPSUBQ, )
//
// C prototype:
//  void dg_forthvpsubqcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSUBQ instruction. This opcode sequence subtracts each 64 bit value
//   in the source from the corresponding 64 bit value in target y and puts the
//   results into the destination.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPSUBQ,    
//                     // XMM0[63:0]  - [RBX][63:0]  -> XMM1[63:0]
//                     // XMM0[127:64] - [RBX][127:64] -> XMM1[127:64]
//                     // 0 -> YMM1[255:128]
//  
//
//  YMM2  YMM0  YMM1  VPSUBQ,       
//                     // YMM0[63:0]  - YMM2[63:0]  -> YMM1[63:0]
//                     // YMM0[127:64] - YMM2[127:64] -> YMM1[127:64]
//                     // YMM0[191:128] - YMM2[191:128] -> YMM1[191:128]
//                     // YMM0[255:192] - YMM2[255:192] -> YMM1[255:192]
//
//  XMM1 <-  XMM0  XMM2  VPSUBQ, 
//                     // XMM0[63:0]  - XMM2[63:0]  -> XMM1[63:0]
//                     // XMM0[127:64] - XMM2[127:64] -> XMM1[127:64]
//                     // 0 -> YMM1[255:128]
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpsubsbcomma ( VPSUBSB, )
//
// C prototype:
//  void dg_forthvpsubsbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSUBSB instruction. This opcode sequence subtracts each signed byte 
//   value in the source from each signed byte in target y and puts the results 
//   into the destination. If the result is larger or smaller than what will fit
//   into a signed byte value, the result is clipped to the largest or smallest
//   signed byte value possible.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPSUBSB,    
//                     // XMM0[7:0]  - [RBX][7:0]  -> XMM1[7:0]
//                     // XMM0[15:8] - [RBX][15:8] -> XMM1[15:8]
//                     // ...
//                     // XMM0[127:120] - [RBX][127:120] -> XMM1[127:120]
//                     // 0 -> YMM1[255:128]
//  
//
//  YMM2  YMM0  YMM1  VPSUBSB,       
//                     // YMM0[7:0]  - YMM2[7:0]  -> YMM1[7:0]
//                     // YMM0[15:8] - YMM2[15:8] -> YMM1[15:8]
//                     // ...
//                     // YMM0[255:248] - YMM2[255:248] -> YMM1[255:248]
//
//  XMM1 <-  XMM0  XMM2  VPSUBSB, 
//                     // XMM0[7:0]  - XMM2[7:0]  -> XMM1[7:0]
//                     // XMM0[15:8] - XMM2[15:8] -> XMM1[15:8]
//                     // ...
//                     // XMM0[127:120] - XMM2[127:120] -> XMM1[127:120]
//                     // 0 -> YMM1[255:128]
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpsubswcomma ( VPSUBSW, )
//
// C prototype:
//  void dg_forthvpsubswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSUBSW instruction. This opcode sequence subtracts each signed 16
//   bit value in the source from the corresponding signed 16 bit value in
//   target y and puts the results into the destination. If a result is larger
//   or smaller than what will fit into a signed 16 bit integer, the result is
//   clipped to the largest or smallest signed 16 bit value possible.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPSUBSW,    
//                     // XMM0[15:0]  - [RBX][15:0]  -> XMM1[15:0]
//                     // XMM0[31:16] - [RBX][31:16] -> XMM1[31:16]
//                     // ...
//                     // XMM0[127:112] - [RBX][127:112] -> XMM1[127:112]
//                     // 0 -> YMM1[255:128]
//  
//
//  YMM2  YMM0  YMM1  VPSUBSW,       
//                     // YMM0[15:0]  - YMM2[15:0]  -> YMM1[15:0]
//                     // YMM0[31:16] - YMM2[31:16] -> YMM1[31:16]
//                     // ...
//                     // YMM0[255:240] - YMM2[255:240] -> YMM1[255:240]
//
//  XMM1 <-  XMM0  XMM2  VPSUBSW, 
//                     // XMM0[15:0]  - XMM2[15:0]  -> XMM1[15:0]
//                     // XMM0[31:16] - XMM2[31:16] -> XMM1[31:16]
//                     // ...
//                     // XMM0[127:112] - XMM2[127:112] -> XMM1[127:112]
//                     // 0 -> YMM1[255:128]
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpsubusbcomma ( VPSUBUSB, )
//
// C prototype:
//  void dg_forthvpsubusbcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSUBUSB instruction. This opcode sequence subtracts each unsigned byte 
//   value in the source from each unsigned byte in target y and puts the results 
//   into the destination. If the result is larger or smaller than what will fit
//   into an unsigned byte value, the result is clipped to the largest or smallest
//   unsigned byte value possible.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPSUBUSB,    
//                     // XMM0[7:0]  - [RBX][7:0]  -> XMM1[7:0]
//                     // XMM0[15:8] - [RBX][15:8] -> XMM1[15:8]
//                     // ...
//                     // XMM0[127:120] - [RBX][127:120] -> XMM1[127:120]
//                     // 0 -> YMM1[255:128]
//  
//
//  YMM2  YMM0  YMM1  VPSUBUSB,       
//                     // YMM0[7:0]  - YMM2[7:0]  -> YMM1[7:0]
//                     // YMM0[15:8] - YMM2[15:8] -> YMM1[15:8]
//                     // ...
//                     // YMM0[255:248] - YMM2[255:248] -> YMM1[255:248]
//
//  XMM1 <-  XMM0  XMM2  VPSUBUSB, 
//                     // XMM0[7:0]  - XMM2[7:0]  -> XMM1[7:0]
//                     // XMM0[15:8] - XMM2[15:8] -> XMM1[15:8]
//                     // ...
//                     // XMM0[127:120] - XMM2[127:120] -> XMM1[127:120]
//                     // 0 -> YMM1[255:128]
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvpsubuswcomma ( VPSUBUSW, )
//
// C prototype:
//  void dg_forthvpsubuswcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist targetzparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//  targetzparameterlist
//
//
//  The parameter list for targetyparameterlist can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//
//  The parameter list for targetxparameterlist or targetzparameterlist can
//   contain these addressing mode specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for targetxparameterlist
//   or targetzparameterlist can also contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  If you want to force 3 byte vex encoding, you can use this:
//   3BYTEVEX
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   3BYTEVEX                     forces 3 byte vex encoding, otherwise 2
//                                 byte vex will be used if possible
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls three targets from the data stack and compiles the opcode sequence for
//   an x86 VPSUBUSW instruction. This opcode sequence subtracts each unsigned
//   16 bit value in the source from each unsigned 16 bit value in target y and
//   puts the results into the destination. If the result is greater or smaller
//   than what will fit into an unsigned 16 bit integer, the result is clipped
//   to the largest or smallest unsigned 16 bit value possible.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  XMM1  VPSUBUSW,
//                     // XMM0[15:0]  - [RBX][15:0]  -> XMM1[15:0]
//                     // XMM0[31:16] - [RBX][31:16] -> XMM1[31:16]
//                     // ...
//                     // XMM0[127:112] - [RBX][127:112] -> XMM1[127:112]
//                     // 0 -> YMM1[255:128]
//  
//
//  YMM2  YMM0  YMM1  VPSUBUSW,
//                     // YMM0[15:0]  - YMM2[15:0]  -> YMM1[15:0]
//                     // YMM0[31:16] - YMM2[31:16] -> YMM1[31:16]
//                     // ...
//                     // YMM0[255:240] - YMM2[255:240] -> YMM1[255:240]
//
//  XMM1 <-  XMM0  XMM2  VPSUBUSW,
//                     // XMM0[15:0]  - XMM2[15:0]  -> XMM1[15:0]
//                     // XMM0[31:16] - XMM2[31:16] -> XMM1[31:16]
//                     // ...
//                     // XMM0[127:112] - XMM2[127:112] -> XMM1[127:112]
//                     // 0 -> YMM1[255:128]
// Note:
//  Only 1 target can be a memory target. The destination target and target y 
//   must be an xmm or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// /////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvtestpdcomma ( VTESTPD, )
//
// C prototype:
//  void dg_forthvtestpdcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VTESTPD instruction. This opcode sequence does a binary AND of the 
//   sign bits of each double precision floating point value in the source with 
//   the sign bits of each double precision floating point value in target y,
//   and if all the results of the ANDs are zero, the zero flag is set, otherwise
//   the zero flag is cleared. This opcode sequence also does a binary AND of the
//   sign bits of each double precision floating point value in the source
//   with the logical inverse of each sign bit of each double precision floating
//   point value in target y, and if all the results of the ANDs are zero, the
//   carry flag is set, otherwise the carry flag is cleared. The destination is
//   not changed. The values in the source and destination do not have to be
//   floating point values. This should also work on signed integer values.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VTESTPD,      
//                       // XMM0[63] AND [RBX][63] AND
//                       //  XMM0[127] AND [RBX][127]  -> ZF
//                       // (not XMM0[63]) AND [RBX][63] AND
//                       //  (not XMM0[127]) AND [RBX][127]  -> CF
//
//  XMM2  XMM0  VTESTPD,  
//                       // XMM0[63] AND XMM2[63] AND
//                       //  XMM0[127] AND XMM2[127]  -> ZF
//                       // (not XMM0[63]) AND XMM2[63] AND
//                       //  (not XMM0[127]) AND XMM2[127]  -> CF
//
//  XMM2 <- XMM0  VTESTPD,    
//                       // XMM0[63] AND XMM2[63] AND
//                       //  XMM0[127] AND XMM2[127]  -> ZF
//                       // (not XMM0[63]) AND XMM2[63] AND
//                       //  (not XMM0[127]) AND XMM2[127]  -> CF
//
//  YMM0  YMM8 VTESTPD,           
//                       // YMM8[63] AND YMM0[63] AND
//                       //  YMM8[127] AND YMM0[127] AND 
//                       //  YMM8[191] AND YMM0[191] AND
//                       //  YMM8[255] AND YMM0[255] -> ZF
//                       // (not YMM8[63]) AND YMM0[63] AND
//                       //  (not YMM8[127]) AND YMM0[127] AND
//                       //  (not YMM8[191]) AND YMM0[191] AND
//                       //  (not YMM8[255]) AND YMM0[255] -> CF
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvtestpscomma ( VTESTPS, )
//
// C prototype:
//  void dg_forthvtestpscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetxmmregister
//   targetymmregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetxmmregister XMMR
//   targetymmregister YMMR
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction, you can use these:
//   ->
//   <-
//
//  Description of target parameters:
//
//   targetxmmregister            in both 32 and 64 bit address mode, one of:
//                                 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7
//                                 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15
//   targetymmregister            in both 32 and 64 bit address mode, one of:
//                                 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7
//                                 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at an
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   XMMR                         specifies an xmmr register target.
//                                 XMMR is optional.
//   YMMR                         specifies a ymmr register target.
//                                 YMMR is optional.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 VTESTPS instruction. This opcode sequence does a binary AND of the 
//   sign bits of each single precision floating point value in the source with 
//   the sign bits of each single precision floating point value in target y,
//   and if all the results of the ANDs are zero, the zero flag is set, otherwise
//   the zero flag is cleared. This opcode sequence also does a binary AND of the
//   sign bits of each single precision floating point value in the source
//   with the logical inverse of each sign bit of each single precision floating
//   point value in target y, and if all the results of the ANDs are zero, the
//   carry flag is set, otherwise the carry flag is cleared. The destination is
//   not changed. The values in the source and destination do not have to be
//   floating point values. This should also work on signed integer values.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  RBX [R]  XMM0  VTESTPS, 
//                       // XMM0[31] AND [RBX][31] AND     
//                       //  XMM0[63] AND [RBX][63] AND
//                       //  XMM0[95] AND [RBX][95] AND
//                       //  XMM0[127] AND [RBX][127]  -> ZF
//                       // (not XMM0[31]) AND [RBX][31] AND     
//                       //  (not XMM0[63]) AND [RBX][63] AND
//                       //  (not XMM0[95]) AND [RBX][95] AND
//                       //  (not XMM0[127]) AND [RBX][127]  -> CF
//
//  XMM2  XMM0  VTESTPS,  
//                       // XMM0[31] AND XMM2[31] AND     
//                       //  XMM0[63] AND XMM2[63] AND
//                       //  XMM0[95] AND XMM2[95] AND
//                       //  XMM0[127] AND XMM2[127]  -> ZF
//                       // (not XMM0[31]) AND XMM2[31] AND     
//                       //  (not XMM0[63]) AND XMM2[63] AND
//                       //  (not XMM0[95]) AND XMM2[95] AND
//                       //  (not XMM0[127]) AND XMM2[127]  -> CF
//
//  XMM2 <- XMM0  VTESTPS,    
//                       // XMM0[31] AND XMM2[31] AND     
//                       //  XMM0[63] AND XMM2[63] AND
//                       //  XMM0[95] AND XMM2[95] AND
//                       //  XMM0[127] AND XMM2[127]  -> ZF
//                       // (not XMM0[31]) AND XMM2[31] AND     
//                       //  (not XMM0[63]) AND XMM2[63] AND
//                       //  (not XMM0[95]) AND XMM2[95] AND
//                       //  (not XMM0[127]) AND XMM2[127]  -> CF
//
//  YMM0  YMM8 VTESTPS,           
//                       // YMM8[31] AND YMM0[31] AND     
//                       //  YMM8[63] AND YMM0[63] AND
//                       //  YMM8[95] AND YMM0[95] AND
//                       //  YMM8[127] AND YMM0[127] AND
//                       //  YMM8[159] AND YMM0[159] AND
//                       //  YMM8[191] AND YMM0[191] AND
//                       //  YMM8[223] AND YMM0[223] AND
//                       //  YMM8[255] AND YMM0[255] -> ZF
//                       // (not YMM8[31]) AND YMM0[31] AND     
//                       //  (not YMM8[63]) AND YMM0[63] AND
//                       //  (not YMM8[95]) AND YMM0[95] AND
//                       //  (not YMM8[127]) AND YMM0[127] AND
//                       //  (not YMM8[159]) AND YMM0[159] AND
//                       //  (not YMM8[191]) AND YMM0[191] AND
//                       //  (not YMM8[223]) AND YMM0[223] AND
//                       //  (not YMM8[255]) AND YMM0[255] -> CF
//
// Note:
//  Only 1 target can be a memory target. The destination target must be an xmm
//   or ymm register.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo...
//   you may see some strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvzeroallcomma ( VZEROALL, )
//
// C prototype:
//  void dg_forthvzeroallcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode sequence for an x86 VZEROALL instruction. 
//  In 32 bit address mode, clears the first 8 YMM registers. 
//  In 64 bit address mode clears all the YMM registers.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  VZEROALL,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthvzerouppercomma ( VZEROUPPER, )
//
// C prototype:
//  void dg_forthvzerouppercomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//
// Execute state action:
//  Compiles the opcode sequence for an x86 VZEROUPPER instruction.
//  In 32 bit address mode, clears the upper half of the first 8 YMM registers. 
//  In 64 bit address mode clears the upper half of all the YMM registers.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Example:
//  VZEROUPPER,
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthxabortcomma ( XABORT, )
//
// C prototype:
//  void dg_forthxabortcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is
//                                 used as the bufferhandle for the array where
//                                 the other bufferhandles are stored.
//
// Stack action shorthand:
//  ( immediatetargetparameterlist -- )
//
//  immediatetargetparameterlist can be one of:
//   ( immediatevalue N -- )
//   ( immediatevalue  minimumimmediatesize  IMMEDIATE -- )
//
//   immediatevalue               Value of an immediate target. Only the lower
//                                 8 bits are used.
//   minimumimmediatesize         minimum size of immediate target and is ignored
//                                 for XABORT,
//   IMMEDIATE                    specifies an immediate target. The value for
//                                 this target is encoded into the opcode
//                                 sequence.
//   N                            specifies an immediate target with a minimum
//                                 size of 0
//
// Execute state action:
//  Pulls immediate target from data stack then compiles the opcode string
//   for the x86 XABORT nn instruction where nn is the lower 8 bits of the
//   immediate target. This opcode sequence checks to see if the processor
//   is executing in an RTM transactional region and if it is then it does
//   an RTM abort, which causes the processor to jump to the address
//   established by the outermost XBEGIN. Then the immediate value is
//   stored in the upper 8 bits of the EAX register, and status information
//   is put into the lower 24 bits of the EAX register.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// 64 bit address mode example:
//  3 N  XABORT,  // if in RTM region then:
//                //  outermostxbeginaddress -> RIP;
//                //  3 -> EAX[31:24]
//                //  status -> EAX[23:0]
//
// Note:
//  I'm not sure what status information is stored into EAX or exactly
//   which bits are used. This is a guess based on what the Intel docs
//   said.
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthxaddcomma ( XADD, )
//
// C prototype:
//  void dg_forthxaddcomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle*           pBHarrayhead  pointer to a Bufferhandle structure
//                                         which is used as the bufferhandle for
//                                         the array where the other bufferhandles
//                                         are stored.
//
// Stack action shorthand:
//
//  ( targetxparameterlist targetyparameterlist -- )
//
// Data stack in:
//
//  targetxparameterlist
//  targetyparameterlist
//
//
//  The parameter list for these targets can contain these addressing mode
//   specifiers:
//
//   targetregister
//   baseregister [R]
//   baseregister displacement [R+N]
//   absoluteaddress [N]
//   baseregister scale indexregister displacement [R+S*R+N]
//
//  In 64 bit addressing mode, the parameter list for a target can also
//   contain this specifier:
//
//   currentcompilebufferoffset [O]
//
//  If you want more control over how the instruction is encoded,
//   you can use these addressing mode specifiers instead:
//
//   targetregister R
//   baseregister displacement minimumdisplacementsize [MOD]
//   baseregister scale indexregister displacement minimumdisplacementsize [SIB]
//
//  If you want to set the direction for modes that allow it, which are modes
//   with no immediate target, you can use these:
//   ->
//   <-
//
//  If you want to set the data size for memory targets, you can use these:
//   (This is optional for this instruction. The compiler can figure it out from
//   the size of the source register.)
//   8BIT
//   16BIT
//   32BIT
//   64BIT
//
//  Alternative way to set the data size:
//   datasizevalue DATASIZE
//
//  Description of target parameters:
//
//   targetregister               in both 32 and 64 bit address mode, one of:
//                                 AL BL CL DL AH BH CH DH
//                                 AX BX CX DX BP SI DI SP
//                                 EAX EBX ECX EDX EBP ESI EDI ESP
//                                in 64 bit address mode, also one of:
//                                 SPL BPL SIL DIL
//                                 RAX RBX RCX RDX RBP RSI RDI RSP
//                                 R8L R9L R10L R11L R12L R13L R14L R15L
//                                 R8W R9W R10W R11W R12W R13W R14W R15W
//                                 R8D R9D R10D R11D R12D R13D R14D R15D
//                                 R8 R9 R10 R11 R12 R13 R14 R15
//   baseregister                 one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI ESP
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI RSP
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   displacement                 signed 32 bit value (even in 64BIT mode)
//   absoluteaddress              signed 32 bit value (even in 64BIT mode)
//   currentcompilebufferoffset   0 based offset in bytes from start of current
//                                 compile buffer
//   scale                        index register is multiplied by the scale,
//                                 one of: SCALE1* SCALE2* SCALE4* SCALE8*
//   indexregister                one of:
//                                 NOREG or
//                                 in 32 bit mode EAX EBX ECX EDX EBP ESI EDI
//                                 in 64 bit mode RAX RBX RCX RDX RBP RSI RDI
//                                  R8 R9 R10 R11 R12 R13 R14 R15
//   [R]                          specifies a memory target at the address in
//                                 the baseregister
//   [R+N]                        specifies a memory target at the address at
//                                 the value in the base register plus the
//                                 signed displacement
//   [N]                          specifies a memory target at the
//                                 absoluteaddress which can't be larger than
//                                 a signed 32 bit integer. This makes [N] not
//                                 very useful in 64 bit mode
//   [O]                          specifies a memory target at an offset in the
//                                 current compile buffer.
//   [R+S*R+N]                    specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//
//   ->                        sets the direction to forward.
//                                 This is the default value.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   <-                        sets the direction to reverse.
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   8BIT                         sets the data size of the instruction to
//                                 1 byte
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   16BIT                        sets the data size of the instruction to
//                                 2 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   32BIT                        sets the data size of the instruction to
//                                 4 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//   64BIT                        sets the data size of the instruction to
//                                 8 bytes
//                                 This is pushed after either addressing mode
//                                 parameters and can not come in the middle
//                                 of addressing mode parameters.
//
//   minimumdisplacementsize      minimum size in bytes, one of 0, 1, or 4.
//                                 The displacement will not be encoded using
//                                 a size less than this.
//   R                            specifies a register target.
//   [MOD]                        specifies a memory target at the address of
//                                 baseregister plus displacement using modr/m
//                                 encoding. The encoding may be promoted to
//                                 sib if modr/m does not support it.
//   [SIB]                        specifies a memory target at the address at
//                                 baseregister + (scale*indexregister) +
//                                 displacement (displacement is signed)
//                                 sib encoding will be used.
//
//   datasizevalue                data size of a memory target in bytes,
//                                 can be  1, 2, 4, or 8
//   DATASIZE                     sets the data size of a memory target
//                                 This is pushed after a memory target
//                                 parameters and can not come in the middle
//                                 of a memory target.
//
// Data stack out:
//  none
//
// Execute state action:
//  Pulls two targets from the data stack and compiles the opcode sequence for
//   an x86 XADD instruction. This opcode sequence exchanges the source and
//   destination then adds them together and puts the result into the
//   destination. The source can not be a memory target. The source and
//   destination sizes must be the same.
//
// Compile state action:
//  Compiles a call to a subroutine that does the execute state action.
//
// Examples:
//  HEX
//  CX  RAX [R]  XADD,   // [RAX][15:0] -> CX
//                       // [RAX][15:0] + CX -> [RAX][15:0]
//
//  ECX  RAX [R]  XADD,  // [RAX][31:0] -> ECX
//                       // [RAX][31:0] + ECX -> [RAX][31:0]
//
//  RCX  RAX [R]  XADD,  // [RAX][63:0] -> RCX
//                       // [RAX][63:0] + RCX -> [RAX][63:0]
//
//  AX  CX  XADD,        // CX -> AX
//                       // CX + AX -> CX
//
//  EAX  ECX  XADD,      // ECX -> EAX
//                       // ECX + EAX -> ECX
//                       // 0 -> RAX[63:32] and RCX[63:32]
//
//  RAX  RCX  XADD,      // RCX -> RAX
//                       // RCX + RAX -> RCX
//
// Failure cases:
//  I didn't check the failure cases thoroughly, soo... you may see some
//   strange things if you are not careful.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthlocalcellsallocatecomma ( LOCAL-CELLS-ALLOCATE, )
//
// C prototype:
//  void dg_forthlocalcellsallocatecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//                                                          
// Action:
//  Pops the number of cells to allocate from the data stack, then compiles code
//   that subtracts the number of bytes needed from the return stack pointer.
//   The compiled code assumes the cell size is 64 bits and is this: 9/1/2020
//    numberofcells*8 N  RSP  SUB,
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthframeparamscurly ( FRAME-PARAMS< )
//
// C prototype:
//  void dg_forthframeparamscurly (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//  ( "<delimiters>word1<delimiters>word2"...
//    <delimiters>wordu<delimiters>>morestuff" -currentinputbuffer- 
//     "morestuff" )
//  
// Current input buffer's current offset in:
//  "<delimiters>word1<delimiters>word2"...
//    <delimiters>wordu<delimiters>morestuff"
//
// Current input buffer's current offset out:
//  "morestuff"          
//
// Forth standard:
//  none
//                                                          
// Action:
//  Clears which registers are being used.
//  Initializes the return stack depth.
//  Sets the parsing mode to parsing ints.
//  Sets the number of integer parameters variable to 0.
//  Sets the number of floating point parameters variable to 0.
//  Moves the current offset pointer for the current input buffer to the character 
//   after the next > or to the end of the buffer if > is not found.
//  For each word name found before the > or end of buffer, whichever comes first:
//   If the word name is INT the parsing mode is set to parsing ints.
//   If the word name is FLOAT the parsing mode is set to parsing floats. 
//   If the word name is not INT or FLOAT, a two constant word with the parsed 
//    word name is added to the locals word list. The low UINT64 of the double constant 
//    is assigned the 0 based parameter index. The high UINT64 of the double constant 
//    is assigned a number which represents the is a frame parameter addressing mode
//    type.
//   Then a new group of words is started. 
//   The total number of integer parameters parsed goes into the number of integer
//    parameters variable.
//   The total number of floating point parameters parsed goes into the number of
//    floating point parameters variable.
//   The index assigned to a particular parameter name is not necessarily in the
//    same order as the parameter names. On Mac, the floating point parameters passed
//    on the stack come after the integer parameters on the stack, so this routine
//    on Mac hands out indexes to the integer parameters before handing out indexes
//    to the floating point ones.
//
// Note:
//  The assembler assumes the subroutine you are compiling is using the RBP register
//   as the frame pointer, which means the frame you are using does RBP PUSH,
//   RSP RBP MOV, at the entry of the subroutine.
//  The assembler converts the parameterindex isframeparam addressing mode 
//   specifier into an RBP offset [R+N] specifier.
//  Also, keep in mind parameters passed in registers are only valid until you do an
//   instruction that modifies the register.
//  The locals wordlist is cleared when ; is compiled, END-CODE is executed, or
//   ?CLEAR-LOCALS is done.
//
// Example:
//  FRAME-PARAMS< INT x FLOAT y INT z FLOAT w >
//
//  Then later when you compile assembly instructions, you can use the parameter name
//   as a target. Such as:
//
//  x  RAX  MOV,
//  y  XMM0  MOVQ,
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthnoframeparamscurly ( NO-FRAME-PARAMS< )
//
// C prototype:
//  void dg_forthnoframeparamscurly (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//  ( "<delimiters>word1<delimiters>word2"...
//    <delimiters>wordu<delimiters>>morestuff" -currentinputbuffer- 
//     "morestuff" )
//  
// Current input buffer's current offset in:
//  "<delimiters>word1<delimiters>word2"...
//    <delimiters>wordu<delimiters>morestuff"
//
// Current input buffer's current offset out:
//  "morestuff"          
//
// Forth standard:
//  none
//                                                          
// Action:
//  Clears which registers are being used.
//  Sets the parsing mode to parsing ints.
//  Sets the number of integer parameters variable to 0.
//  Sets the number of floating point parameters variable to 0.
//  Sets the PRSDEPTH to the default value for the operating system.
//  Moves the current offset pointer for the current input buffer to the character 
//   after the next > or to the end of the buffer if > is not found.
//  For each word name found before the > or end of buffer, whichever comes first:
//   If the word name is INT the parsing mode is set to parsing ints.
//   If the word name is FLOAT the parsing mode is set to parsing floats. 
//   If the word name is not INT or FLOAT, a two constant word with the parsed 
//    word name is added to the locals word list. The low UINT64 of the double constant 
//    is assigned the 0 based parameter index. The high UINT64 of the double constant 
//    is assigned a number which represents the is a no frame parameter addressing
//    mode type.
//   Then a new group of words is started. 
//   The total number of integer parameters parsed goes into the number of integer
//    parameters variable.
//   The total number of floating point parameters parsed goes into the number of
//    floating point parameters variable.
//   The index assigned to a particular parameter name is not necessarily in the
//    same order as the parameter names. On Mac, the floating point parameters passed
//    on the stack come after the integer parameters on the stack, so this routine
//    on Mac hands out indexes to the integer parameters before handing out indexes
//    to the floating point ones.
//
// Note:
//  The assembler converts the parameterindex isframeparam addressing mode 
//   specifier into an RSP offset [R+N] specifier.
//  The assembler uses the current value of the PRSDEPTH variable when compiling
//   to determine the offset.
//  The value in PRSDEPTH is 0 based and goes up by 1 for each UINT64 value on the 
//   stack. On Mac, the returnaddress is on the stack at the entry of the subroutine
//   which means PRSDEPTH starts off as 1. On Windows, the returnaddress and 4
//   shadow parameters are on the stack which means PRSDEPTH starts off as 5. 
//   In order for this mode to work, you need to add 1 to PRSDEPTH for each 64 bit
//   value you push onto the return stack.
//  Also, keep in mind parameters passed in registers are only valid until you do an
//   instruction that modifies the register.
//  The locals wordlist is cleared when ; is compiled, END-CODE is executed, or
//   ?CLEAR-LOCALS is done.
//
// Example:
//  NO-FRAME-PARAMS< INT x FLOAT y INT z FLOAT w >
//
//  Then later when you compile assembly instructions, you can use the parameter name
//   as a target. Such as:
//
//  x  RAX  MOV,
//  y  XMM0  MOVQ,
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcallsubsframelocalscommacurly 
//  ( CALL-SUBS-FRAME-LOCALS,< DGLU-FORTH-FRAME-LOCALS,< )
//
// C prototype:
//  void dg_forthcallsubsframelocalscommacurly (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//  ( "<delimiters>word1<delimiters>word2"...
//    <delimiters>wordu<delimiters>>morestuff" -currentinputbuffer- 
//     "morestuff" )
//  
// Current input buffer's current offset in:
//  "<delimiters>word1<delimiters>word2"...
//    <delimiters>wordu<delimiters>morestuff"
//
// Current input buffer's current offset out:
//  "morestuff"          
//
// Forth standard:
//  none
//                                                          
// Action:
//  Moves the current offset pointer for the current input buffer to the character 
//   after the next > or to the end of the buffer if > is not found.
//  For each word name found before the > or end of buffer, whichever comes first:
//   A two constant word with the parsed word name is added to the locals word list. 
//    The low UINT64 of the double constant is assigned the 0 based local parameter
//     index. 
//    The high UINT64 of the double constant is assigned a number which represents 
//     the is a local variable addressing mode type.
//   Then a new group of words is started. 
//   For each local parameter created, code is compiled to allocate room on the
//    frame for those parameters. The frame has a variable marking where the local
//    area ends and after adjusting the return stack pointer, that variable is
//    updated. This means you can have more than one CALL-SUBS-FRAME-LOCALS,<
//    in a subroutine, and can do it anywhere in the subroutine, but you can't
//    use the local variables until after they are created. Not sure why you
//    would need to do that though.
//
// Note:
//  The assembler assumes the subroutine you are compiling is using either the
//   call subs frame or the dglu forth frame for your subroutine.
//  The assembler calculates the depth of the local variable using the value in 
//   PRSDEPTH and remembers it as an RBP offset [R+N] specifier. 
//  PRSDEPTH is initialized when either ENTER-CALL-SUBS-FRAME, or
//   FRAME-PARAMS< is done.
//  The locals wordlist is cleared when ; is compiled, END-CODE is executed, or
//   ?CLEAR-LOCALS is done.
//  The locals wordlist is a wordlist where temporary words are stored.
//  The local variables allocated by this routine are stored on the return stack.
//  Sorry about using the same name for two different things.
//
// Example:
//  CALL-SUBS-FRAME-LOCALS,< x y z >
//
//  Then later when you compile assembly instructions, you can use the local name
//   as a target. Such as:
//
//  RAX x  MOV,
//  x  XMM0  MOVQ,
//  RCX y  MOV,
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthendsubparamscomma ( )), )
//
// C prototype:
//  void dg_forthendsubparamscomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Stack action shorthand:
//  ( seebelow.... -- )          
//
// Action:
//  This routine uses the number of >IPARAM >FPARAM P>IPARAM words encountered or the
//   number of IPARAM> FPARAM> words encountered since (( was done to determine how 
//   many addressing mode targets to expect on the data stack.
//  This routine uses the >IPARAM >FPARAM P>IPARAM IPARAM> FPARAM> to determine the
//   addressing mode target for the parameter being done.
//  This routine pops a value off of the data stack. It expects the value to be one
//   of >IPARAM >FPARAM P>IPARAM IPARAM> FPARAM> or the marker from ((.
//  If there are still targets to do and you get the (( marker or you finish
//   all the targets and you don't get the (( marker, it's an error.
//  If the value popped is >IPARAM it pulls a source addressing mode target off the  
//   data stack and compiles a move from the target to the current parameter target. 
//   If one or both targets are registers, a MOV, is used. If both
//    targets are memory, source RAX MOV, RAX destination MOV, is used.
//   If the source target is an XMM register, it causes an error.
//  If the value popped is >FPARAM it pulls a source addressing mode target off the 
//    data stack and compiles a move from the target to the current parameter target.
//   If one or both targets are an XMM register, a MOVQ, is
//     used. If both targets are memory, source RAX MOV, RAX destination MOV, is used.
//    If the source target is a regular integer register, it causes an error.
//  If the value popped is P>IPARAM it pulls a source addressing mode target off the  
//   data stack and compiles an LEA, from the target to the current parameter target. 
//   If the destination is a register, just an LEA, is used. If both
//    targets are memory, source RAX LEA, RAX destination MOV, is used.
//   Since this is LEA, the source has to be a memory target.
//  If the value popped is IPARAM> it pulls a destination addressing mode target off 
//   the data stack and compiles a MOV, from the current parameter target to the 
//   destination target.
//  If the value popped is FPARAM> it pulls a destination addressing mode target off 
//   the data stack and compiles a MOVQ, from the current parameter target to the 
//   destination target.
//
// Note:
//  Requires using the call subs frame.
//  Use >IPARAM and >FPARAM to set up the parameters for a subroutine call.
//  Use IPARAM> and FPARAM> to get the parameters after a subroutine call.
//  At this time you can only use either >IPARAM and >FPARAM between (( )), or
//   IPARAM> and FPARAM> between (( )), not both sets. 
//  DiaperGlu currently does not keep track of which registers get trashed, so if
//   you want to pass a function's input parameters to a subroutine you may need
//   to copy them somewhere like a local variable so they don't get trashed.
//
// Example:
//  OSYMBOL omyvariable
//    HEX 1237465 CODE-U64,
// 
//  ( subroutine start e.g. CODE functionname or OSYMBOL functionname )
//    ENTER-CALL-SUBS-FRAME,
//    FRAME-PARAMS< INT param0 INT param1 FLOAT param2 INT param3 >
//    CALL-SUBS-FRAME-LOCALS,< local0 local1 local2 local3 >
//    param0 local0 MOV,
//    param1 local1 MOV,
//    param2 local2 MOVQ,
//    param3 local3 MOV,
//    (( 
//      local0 >IPARAM      
//        // on Mac gets converted to RBP local0offset [R+N]  RDI  MOV,
//      5 N    >IPARAM      
//        // on Mac gets converted to 5 N  RSI  MOV,
//      EH. omyvariable [O]  >IPARAM  
//        // on Mac gets converted to RIP omyvariabledisplacement [R+N]  RDX  MOV,
//      local1 P>IPARAM
//        // on Mac gets converted to RBP local1offset [R+N]  RCX  LEA,
//      local2 >FPARAM
//        // on Mac gets converted to RBP local2offset [R+N]  XMM0  MOVQ,
//      8 N    >IPARAM
//        // on Mac gets converted to 8 N  R8  MOV,
//      RAX    >IPARAM
//        // on Mac gets converted to RAX  R9  MOV,
//      local3  >IPARAM
//        // on Mac gets converted to 
//        //  RBP local3offset [R+N]  RAX           MOV,
//        //  RAX                     RSP 0  [R+N]  MOV,
//    )),
//    EH. omysubroutine O CALL,
//     
//    EXIT-CALL-SUBS-FRAME,
//    (( EH. omyvariable [O]  IPARAM> )),
//      // gets converted to  RAX  RIP EH. omyvariabledisplacement [R+N]  MOV,
//    RET,
// 
// Forth standard:
//  none   
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthbeginsubparams ( (( )
//
// C prototype:
//  void dg_forthbeginsubparams (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- dg_subparamscommamarker )          
//
// Action:
//  Sets the number of call sub integer parameters to 0.
//  Sets the number of call sub floating point parameters to 0.
//  Pushes the dg_subparamscommamarker onto the data stack.
//  
//
// Forth standard:
//  none
//                                                           
//
// Example:
//  (( R8 >IPARAM R9 >IPARAM )),  // On Mac compiles: R8 RDI MOV,  R9 RSI MOV, 
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthregscurly ( REGS< )
//
// C prototype:
//  void dg_forthregscurly (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//  ( "<delimiters>word1<delimiters>word2"...
//    <delimiters>wordu<delimiters>>morestuff" -currentinputbuffer- 
//     "morestuff" )
//  
// Current input buffer's current offset in:
//  "<delimiters>word1<delimiters>word2"...
//    <delimiters>wordu<delimiters>>morestuff"
//
// Current input buffer's current offset out:
//  "morestuff"          
//
// Forth standard:
//  none
//                                                          
// Action:
//  By default, available int regs are used.
//  For each word name found before the > or end of buffer, whichever comes first: 
//    if the word name is INT then available int regs are used for the names that
//     come after
//    if the word name is FLOAT then available float regs are used for the names
//     that come after
//    otherwise, a constant word with the parsed word name is added to the locals 
//     word list. 
//    The UINT64 value of the constant is the id of the next unused int or float
//     register in the list of allocatable regs. If no regs are available, errors 
//     are pushed to the error stack instead. 
//
// Note:
//  NO-FRAME-PARAMS< and CALL-SUBS-FRAME-PARAMS< initialize which regs are
//    used.
//  RMASK-CALL-SUBS-FRAME-PRESERVE, and RMASK-NO-FRAME-PRESERVE, can move parameters
//   and regs that must be preserved to the return stack to add more regs to the list
//   of regs that can be used. These functions can preserve RBP but RBP is not in
//   the list of regs that can be available because the call subs frame uses it
//   as the frame pointer. This means REGS< will not allocate RBP at this time
//   even though it could be used in a no frame. 2023 Mar 29 J.N.
//
// Example:
//  REGS< INT x FLOAT y INT z FLOAT w >
//
//  Then later when you compile assembly instructions, you can use the name
//   as a register target. Such as:
//
//  x  RAX  MOV,
//  y  XMM0  MOVQ,
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthcallsubsfastlocalscommacurly ( CALL-SUBS-FRAME-FAST-LOCALS,< )
//
// C prototype:
//  void dg_forthcallsubsfastlocalscommacurly (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- )
//  ( "<delimiters>word1<delimiters>word2"...
//    <delimiters>wordu<delimiters>>morestuff" -currentinputbuffer- 
//     "morestuff" )
//  
// Current input buffer's current offset in:
//  "<delimiters>word1<delimiters>word2"...
//    <delimiters>wordu<delimiters>>morestuff"
//
// Current input buffer's current offset out:
//  "morestuff"          
//
// Forth standard:
//  none
//                                                          
// Action:
//  By default, available int regs are used.
//  For each word name found before the > or end of buffer, whichever comes first: 
//    if the word name is INT then available int regs are used for the names that
//     come after
//    if the word name is FLOAT then available float regs are used for the names
//     that come after
//    otherwise, if a reg is available, a constant word with the parsed word name is 
//     added to the locals word list representing the next available reg 
//    The UINT64 value of the constant is the id of the next unused int or float
//     register in the list of allocatable regs. If no regs are available, a local
//     variable on the local call subs return stack frame is allocated instead
//
// Note:
//  CALL-SUBS-FRAME-PARAMS< initializes which regs are used.
//  RMASK-CALL-SUBS-FRAME-PRESERVE, can move parameters and regs that must be 
//   preserved to the return stack to add more regs to the list of regs that can 
//   be used. (You can preserve RBP with this function but it is already in use
//   as the frame pointer and will not be available for use as a fast local).
//  Fast locals are assigned to int regs, float regs, and return stack memory.
//   I tried to add stuff so things would work like if you specified a memory
//   to memory operation, but you may still have to keep track of what type of
//   targets you are working with. For instance, ADD, only works with integer
//   registers and memory targets; and in some situations you may need to 
//   specify the data size of the operation.
//    
//
// Example:
//  CALL-SUBS-FRAME-FAST-LOCALS,< INT x FLOAT y INT z FLOAT w >
//
//  Then later when you compile assembly instructions, you can use the name
//   as a register target. Such as:
//
//  x  RAX  MOV,
//  y  XMM0  MOVQ,
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthstringrmask ( $-RMASK )
//
// C prototype:
//  none (it's a UINT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           UINT64 representing registers RCX RDI and RSI
//
// Action:
//  Pushes the rmask for the registers used in x86 string instructions which are
//   RCX RDI RSI. 
//
// Note:
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthstringsrcrmask ( $SRC-RMASK )
//
// C prototype:
//  none (it's a UINT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           UINT64 representing register RSI
//
// Action:
//  Pushes the rmask for the register used as the source register in x86 string 
//   instructions which is RSI. 
//
// Note:
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthstringdestrmask ( $DEST-RMASK )
//
// C prototype:
//  none (it's a UINT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           UINT64 representing register RDI
//
// Action:
//  Pushes the rmask for the register used as the destination register in x86 string 
//   instructions which is RDI. 
//
// Note:
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthstringctrrmask ( $CTR-RMASK )
//
// C prototype:
//  none (it's a UINT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           UINT64 representing register RCX
//
// Action:
//  Pushes the rmask for the register used as the counter register in x86 string 
//   instructions which is RCX. 
//
// Note:
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmathhirmask ( MATH-HI-RMASK )
//
// C prototype:
//  none (it's a UINT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           UINT64 representing register RDX
//
// Action:
//  Pushes the rmask for the register used as the high 64 bits of the destination
//   register in x86 multiply instructions. This is also the destination in some
//   division instructions.
//
// Note:
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthmustbepreservedrmask ( MUST-BE-PRESERVED-RMASK )
//
// C prototype:
//  none (it's a UINT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           UINT64 representing the must be preserved registers
//
// Action:
//  Pushes the rmask for the registers which must be preserved during a subroutine
//   call. This list is not the same on different operating systems.
//
// Note:
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthimustbepreservedrmask ( IMUST-BE-PRESERVED-RMASK )
//
// C prototype:
//  none (it's a UINT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           UINT64 representing the must be preserved integer
//                                   registers
//
// Action:
//  Pushes the rmask for the integer registers which must be preserved during a 
//   subroutine call. This list is not the same on different operating systems.
//
// Note:
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfmustbepreservedrmask ( FMUST-BE-PRESERVED-RMASK )
//
// C prototype:
//  none (it's a UINT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           UINT64 representing the must be preserved floating
//                                   point registers
//
// Action:
//  Pushes the rmask for the floating point registers which must be preserved during 
//   a subroutine call. This list is not the same on different operating systems.
//
// Note:
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthuimustbepreservedrmask ( U-IMUST-BE-PRESERVED-RMASK )
//
// C prototype:
//  void dg_forthuimustbepreservedrmask (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Stack action shorthand:
//  ( u -- rmask )
//
// Data stack in:
//  u                             the number of integer registers to preserve 
//
// Data stack out:
//  rmask                         the rmask of the registers that were preserved
//                                                               
// Action:
//  Pops u off the data stack and uses it to get the lowest u set bits of the 
//   IMUST-BE-PRESERVED-RMASK. If u is greater than the number of bits set in 
//   IMUST-BE-PRESERVED-RMASK, then the IMUST-BE-PRESERVED-RMASK is pushed to the 
//   data stack. Otherwise, a mask containing the lowest u set bits in the
//   IMUST-BE-PRESERVED-RMASK is generated.
//   
//
// Note:
//  IMUST-BE-PRESERVED-RMASK includes RBP in the list of possible integer regs
//   to be preserved. RBP is not in the list of allocatable regs at this time.
//   (RBP is used as the frame pointer in the call subs frame and using it for
//   other purposes will break access to call subs frame local variables and 
//   parameters...)
//   What this means is the number of integer regs preserved with this function
//   is not necessarily the number of regs made available for use. 2023 Mar 29 J.N.
//   With that in mind, you can use this to try to make sure a certain number of 
//   registers are available for use without knowing exactly which ones they are...
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// Failure cases:
//  u is missing from the data stack (underflow)
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthiparamsrmask ( IPARAMS-RMASK )
//
// C prototype:
//  none (it's a UINT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           UINT64 representing registers used to pass integer
//                                   parameters
//
// Action:
//  Pushes the rmask for the registers which can be used to pass integer parameters
//   to a subroutine. This list is not the same on different operating systems.
//
// Note:
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfparamsrmask ( FPARAMS-RMASK )
//
// C prototype:
//  none (it's a UINT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           UINT64 representing registers used to pass floating
//                                   point parameters
//
// Action:
//  Pushes the rmask for the registers which can be used to pass floating point
//   parameters to a subroutine. This list is not the same on different operating 
//   systems.
//
// Note:
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthparamsrmask ( PARAMS-RMASK )
//
// C prototype:
//  none (it's a UINT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           UINT64 representing registers used to pass floating
//                                   point or integer parameters
//
// Action:
//  Pushes the rmask for the registers which can be used to pass floating point
//   or integer parameters to a subroutine. This list is not the same on different 
//   operating systems.
//
// Note:
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthallocatablermask ( ALLOCATABLE-RMASK )
//
// C prototype:
//  none (it's a UINT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           UINT64 representing registers used to hold local
//                                   variables
//
// Action:
//  Pushes the rmask for the registers which can be used to hold local integer or
//   floating point variables. 
//
// Note:
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthiallocatablermask ( IALLOCATABLE-RMASK )
//
// C prototype:
//  none (it's a UINT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           UINT64 representing registers used to hold local
//                                   integer variables
//
// Action:
//  Pushes the rmask for the registers which can be used to hold local integer 
//   variables. 
//
// Note:
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfallocatablermask ( FALLOCATABLE-RMASK )
//
// C prototype:
//  none (it's a UINT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           UINT64 representing registers used to hold local
//                                   floating point variables
//
// Action:
//  Pushes the rmask for the registers which can be used to hold local floating point 
//   variables. 
//
// Note:
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthiparam0rmask ( IPARAM0-RMASK )
//
// C prototype:
//  none (it's a UINT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           UINT64 representing registers used to pass the
//                                   first integer parameter to a subroutine
//
// Action:
//  Pushes the rmask for the register used to pass the first integer parameter to a 
//   subroutine. On Windows, this is RCX, on Mac, this is RDI.
//
// Note:
//  On Windows, the first four parameters are passed in registers regardless of 
//   whether they are integer or float, so you would use this if the first parameter
//   is an integer.
//  On Mac, this is used for the first integer parameter regardless of which parameter
//   it is in the parameter list.
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthiparam1rmask ( IPARAM1-RMASK )
//
// C prototype:
//  none (it's a UINT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           UINT64 representing registers used to pass the
//                                   second integer parameter to a subroutine
//
// Action:
//  Pushes the rmask for the register used to pass the second integer parameter to a 
//   subroutine. On Windows, this is RDX, on Mac, this is RSI.
//
// Note:
//  On Windows, the first four parameters are passed in registers regardless of 
//   whether they are integer or float, so you would use this if the second parameter
//   is an integer.
//  On Mac, this is used for the second integer parameter regardless of which parameter
//   it is in the parameter list.
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthiparam2rmask ( IPARAM2-RMASK )
//
// C prototype:
//  none (it's a UINT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           UINT64 representing registers used to pass the
//                                   third integer parameter to a subroutine
//
// Action:
//  Pushes the rmask for the register used to pass the third integer parameter to a 
//   subroutine. On Windows, this is R8, on Mac, this is RDX.
//
// Note:
//  On Windows, the first four parameters are passed in registers regardless of 
//   whether they are integer or float, so you would use this if the third parameter
//   is an integer.
//  On Mac, this is used for the third integer parameter regardless of which parameter
//   it is in the parameter list.
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthiparam3rmask ( IPARAM3-RMASK )
//
// C prototype:
//  none (it's a UINT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           UINT64 representing registers used to pass the
//                                   fourth integer parameter to a subroutine
//
// Action:
//  Pushes the rmask for the register used to pass the fourth integer parameter to a 
//   subroutine. On Windows, this is R9, on Mac, this is RCX.
//
// Note:
//  On Windows, the first four parameters are passed in registers regardless of 
//   whether they are integer or float, so you would use this if the fourth parameter
//   is an integer.
//  On Mac, this is used for the fourth integer parameter regardless of which parameter
//   it is in the parameter list.
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthiparam4rmask ( IPARAM4-RMASK )
//
// C prototype:
//  none (it's a UINT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           UINT64 representing registers used to pass the
//                                   fifth integer parameter to a subroutine
//
// Action:
//  Pushes the rmask for the register used to pass the fifth integer parameter to a 
//   subroutine. On Windows, the fifth parameter is not passed in a register,
//   on Mac, this is R8.
//
// Note:
//  On Windows, the first four parameters are passed in registers regardless of 
//   whether they are integer or float, so this parameter is not passed in a register
//   and this returns an rmask of 0.
//  On Mac, this is used for the fifth integer parameter regardless of which parameter
//   it is in the parameter list.
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthiparam5rmask ( IPARAM5-RMASK )
//
// C prototype:
//  none (it's a UINT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           UINT64 representing registers used to pass the
//                                   sixth integer parameter to a subroutine
//
// Action:
//  Pushes the rmask for the register used to pass the sixth integer parameter to a 
//   subroutine. On Windows, the sixth parameter is not passed in a register,
//   on Mac, this is R9.
//
// Note:
//  On Windows, the first four parameters are passed in registers regardless of 
//   whether they are integer or float, so this parameter is not passed in a register
//   and this returns an rmask of 0.
//  On Mac, this is used for the sixth integer parameter regardless of which parameter
//   it is in the parameter list.
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthiparam6rmask ( IPARAM6-RMASK )
//
// C prototype:
//  none (it's a UINT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           UINT64 representing registers used to pass the
//                                   seventh integer parameter to a subroutine
//
// Action:
//  Pushes the rmask for the register used to pass the seventh integer parameter to a 
//   subroutine. On Windows and Mac, the seventh integer parameter is not passed in a 
//   register.
//
// Note:
//  On Windows, the first four parameters are passed in registers regardless of 
//   whether they are integer or float, so this parameter is not passed in a register
//   and this returns an rmask of 0.
//  On Mac, only the first six integer parameters are passed in registers so this 
//   parameter is not passed in a register and this returns an rmask of 0.
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthiparam7rmask ( IPARAM7-RMASK )
//
// C prototype:
//  none (it's a UINT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           UINT64 representing registers used to pass the
//                                   eighth integer parameter to a subroutine
//
// Action:
//  Pushes the rmask for the register used to pass the eighth integer parameter to a 
//   subroutine. On Windows and Mac, the eighth integer parameter is not passed in a 
//   register.
//
// Note:
//  On Windows, the first four parameters are passed in registers regardless of 
//   whether they are integer or float, so this parameter is not passed in a register
//   and this returns an rmask of 0.
//  On Mac, only the first six integer parameters are passed in registers so this 
//   parameter is not passed in a register and this returns an rmask of 0.
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfparam0rmask ( FPARAM0-RMASK )
//
// C prototype:
//  none (it's a UINT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           UINT64 representing the register used to pass the
//                                   first floating point parameter to a subroutine
//
// Action:
//  Pushes the rmask for the register used to pass the first floating point parameter 
//   to a subroutine. On Windows, this is XMM0, on Mac, this is XMM0.
//
// Note:
//  On Windows, the first four parameters are passed in registers regardless of 
//   whether they are integer or float, so you would use this if the first parameter
//   is a float.
//  On Mac, this is used for the first floating point parameter regardless of which 
//   parameter it is in the parameter list.
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfparam1rmask ( FPARAM1-RMASK )
//
// C prototype:
//  none (it's a UINT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           UINT64 representing the register used to pass the
//                                   second floating point parameter to a subroutine
//
// Action:
//  Pushes the rmask for the register used to pass the second floating point parameter 
//   to a subroutine. On Windows, this is XMM1, on Mac, this is XMM1.
//
// Note:
//  On Windows, the first four parameters are passed in registers regardless of 
//   whether they are integer or float, so you would use this if the second parameter
//   is a float.
//  On Mac, this is used for the second floating point parameter regardless of which 
//   parameter it is in the parameter list.
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfparam2rmask ( FPARAM2-RMASK )
//
// C prototype:
//  none (it's a FLOAT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           FLOAT64 representing the register used to pass the
//                                   third floating point parameter to a subroutine
//
// Action:
//  Pushes the rmask for the register used to pass the third floating point parameter 
//   to a subroutine. On Windows, this is XMM2, on Mac, this is XMM2.
//
// Note:
//  On Windows, the first four parameters are passed in registers regardless of 
//   whether they are integer or float, so you would use this if the third parameter
//   is a float.
//  On Mac, this is used for the third floating point parameter regardless of which 
//   parameter it is in the parameter list.
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfparam3rmask ( FPARAM3-RMASK )
//
// C prototype:
//  none (it's a FLOAT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           FLOAT64 representing the register used to pass the
//                                   fourth floating point parameter to a subroutine
//
// Action:
//  Pushes the rmask for the register used to pass the fourth floating point parameter 
//   to a subroutine. On Windows, this is XMM3, on Mac, this is XMM3.
//
// Note:
//  On Windows, the first four parameters are passed in registers regardless of 
//   whether they are integer or float, so you would use this if the fourth parameter
//   is a float.
//  On Mac, this is used for the fourth floating point parameter regardless of which 
//   parameter it is in the parameter list.
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfparam4rmask ( FPARAM4-RMASK )
//
// C prototype:
//  none (it's a FLOAT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           FLOAT64 representing the register used to pass the
//                                   fifth floating point parameter to a subroutine
//
// Action:
//  Pushes the rmask for the register used to pass the fifth floating point parameter 
//   to a subroutine. On Windows, this parameter is not passed in a register,
//   on Mac, this is XMM4.
//
// Note:
//  On Windows, the first four parameters are passed in registers regardless of 
//   whether they are integer or float, so this parameter is not passed in a 
//   register and the rmask is 0.
//  On Mac, this is used for the fifth floating point parameter regardless of which 
//   parameter it is in the parameter list.
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfparam5rmask ( FPARAM5-RMASK )
//
// C prototype:
//  none (it's a FLOAT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           FLOAT64 representing the register used to pass the
//                                   sixth floating point parameter to a subroutine
//
// Action:
//  Pushes the rmask for the register used to pass the sixth floating point parameter 
//   to a subroutine. On Windows, this parameter is not passed in a register,
//   on Mac, this is XMM5.
//
// Note:
//  On Windows, the first four parameters are passed in registers regardless of 
//   whether they are integer or float, so this parameter is not passed in a 
//   register and the rmask is 0.
//  On Mac, this is used for the sixth floating point parameter regardless of which 
//   parameter it is in the parameter list.
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfparam6rmask ( FPARAM6-RMASK )
//
// C prototype:
//  none (it's a FLOAT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           FLOAT64 representing the register used to pass the
//                                   seventh floating point parameter to a subroutine
//
// Action:
//  Pushes the rmask for the register used to pass the seventh floating point parameter 
//   to a subroutine. On Windows, this parameter is not passed in a register,
//   on Mac, this is XMM6.
//
// Note:
//  On Windows, the first four parameters are passed in registers regardless of 
//   whether they are integer or float, so this parameter is not passed in a 
//   register and the rmask is 0.
//  On Mac, this is used for the seventh floating point parameter regardless of which 
//   parameter it is in the parameter list.
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthfparam7rmask ( FPARAM7-RMASK )
//
// C prototype:
//  none (it's a FLOAT64 constant)
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                           FLOAT64 representing the register used to pass the
//                                   eighth floating point parameter to a subroutine
//
// Action:
//  Pushes the rmask for the register used to pass the eighth floating point parameter 
//   to a subroutine. On Windows, this parameter is not passed in a register,
//   on Mac, this is XMM7.
//
// Note:
//  On Windows, the first four parameters are passed in registers regardless of 
//   whether they are integer or float, so this parameter is not passed in a 
//   register and the rmask is 0.
//  On Mac, this is used for the eighth floating point parameter regardless of which 
//   parameter it is in the parameter list.
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthavailablermask ( AVAILABLE-RMASK )
//
// C prototype:
//  void dg_forthavailablermask (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- rmask )
//
//  rmask                         FLOAT64 representing the registers that are
//                                 available for use   
//
// Action:
//  Pushes the rmask for the list of registers that are currently available for
//   use.
//
// Note:
//  This function uses the value in the variable PUSED-RMASK along with the list
//   of allocatable registers to determine which registers are available. Declaring
//   registers as parameters initializes this variable. Preserving registers makes
//   them available. Declaring registers as used makes them unavailable.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthprsdepth ( PRSDEPTH )
//
// C prototype:
//  none (It's a UINT64 address which is calculated using an offset and bufferid)
//
// Stack action shorthand:
//  ( -- a-addr )
//
// Data stack out
//  a-addr                        address of the UINT64 rsdepth variable
//                                                              
// Action:
//  Gets the address of the rsdepth variable.
//
// Note:
//  The x86 assembler uses this variable to keep track of the return stack depth for
//   some words that create symbols to represent parameters and local
//   variables, and preserve registers on the return stack.
//  The value in rsdepth is 0 based and represents the number of UINT64s pushed
//   onto the return stack by the functions that use it.
//  If you push your own values onto the return stack and leave them on the return
//   stack and want words like RMASK-NO-FRAME-UNPRESERVE, to work correctly, you will
//   need to modify rsdepth.
//  This variable is initialized by ENTER-CALL-SUBS-FRAME, or FRAME-PARAMS< or
//   NO-FRAME-PARAMS< and modified when you declare call subs frame locals 
//   or preserve registers.
//
// Failure cases:
//  unable to get pointer to the rsdepth variable
//  unable to push to the data stack
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthppreserveddepth ( PPRESERVED-DEPTH )
//
// C prototype:
//  none (It's a UINT64 address which is calculated using an offset and bufferid)
//
// Stack action shorthand:
//  ( -- a-addr )
//
// Data stack out
//  a-addr                        address of the UINT64 preserveddepth variable
//                                                              
// Action:
//  Gets the address of the preserveddepth variable.
//
// Note:
//  The x86 assembler uses this variable to keep track of the return stack depth
//   at which registers were preserved.
//  The value is 0 based and represents the total number of UINT64s pushed
//   onto the return stack after the regs were preserved including the stuff pushed
//   onto the return stack before the regs were preserved.
//
// Failure cases:
//  unable to get pointer to the regspreserveddepth variable
//  unable to push to the data stack
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthppreservedmask ( PPRESERVED-RMASK )
//
// C prototype:
//  none (It's a UINT64 address which is calculated using an offset and bufferid)
//
// Stack action shorthand:
//  ( -- a-addr )
//
// Data stack out
//  a-addr                        address of the UINT64 preservedrmask variable
//                                                              
// Action:
//  Gets the address of the preservedrmask variable.
//
// Note:
//  The x86 assembler uses this variable to keep track of which registers were
//   preserved on the return stack.
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// Failure cases:
//  unable to get pointer to the preservedrmask variable
//  unable to push to the data stack
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthpusedrmask ( PUSED-RMASK )
//
// C prototype:
//  none (It's a UINT64 address which is calculated using an offset and bufferid)
//
// Stack action shorthand:
//  ( -- a-addr )
//
// Data stack out
//  a-addr                        address of the UINT64 usedrmask variable
//                                                              
// Action:
//  Gets the address of the usedrmask variable.
//
// Note:
//  The x86 assembler uses this variable to keep track of which registers are
//   in use and not available for allocation on the return stack.
//  This variable is initialized when you declare parameters using FRAME-PARAMS<
//   or NO-FRAME-PARAMS< and holds the set of registers representing the
//   parameters you just declared and the set of registers which must be
//   preserved.
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// Failure cases:
//  unable to get pointer to the usedrmask variable
//  unable to push to the data stack
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrtormaskpos ( R>RMASKPOS )
//
// C prototype:
//  void dg_forthrtormaskpos (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Stack action shorthand:
//  ( reg -- rmaskbitpos )
//
// Data stack in:
//  reg                           UINT64 representing a default register
// 
// Data stack out:
//  rmaskbitpos                   0 based bit index of the reg in an rmask 
//                                or -1 if reg is not an rmask reg
//                                                              
// Action:
//  Pops reg off the data stack and determines which bit position in an rmask 
//   represents the register. Then pushes the register's bit position to the data 
//   stack. If the register is not represented in the rmask, -1 is pushed to the
//   data stack instead.
//
// Note:
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//  Doing 2^U on a register's bit position will give you the rmask for that register.
//
// Failure cases:
//  reg is missing from the data stack (underflow)
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrmaskpostor ( RMASKPOS>R )
//
// C prototype:
//  void dg_forthrmaskpostor (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Stack action shorthand:
//  ( rmaskbitpos -- reg )
//
// Data stack in:
//  rmaskbitpos                   0 based bit index of the reg in an rmask 
//                                or -1 if reg is not an rmask reg
//
// Data stack out:
//  reg                           UINT64 representing a default register
//                                                               
// Action:
//  Pops rmaskbitpos off the data stack which is a 0 based index into an array of
//   default register constants. If rmaskbitpos is less than the length of this
//   array then the UINT64 representing the register at the index is pushed 
//   back to the data stack. If rmaskbitpos is not valid, then the UINT64 
//   representing NOREG is pushed to the data stack.
//   
//
// Note:
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// Failure cases:
//  rmaskbitpos is missing from the data stack (underflow)
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrtormask ( R>RMASK )
//
// C prototype:
//  void dg_forthrtormask (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Stack action shorthand:
//  ( reg -- rmask )
//
// Data stack in:
//  reg                           UINT64 representing a default register
// 
// Data stack out:
//  rmask                         an rmask with one bit set representing the register 
//                                                              
// Action:
//  Pops reg off the data stack and converts it to an rmask. The resulting rmask has
//   one bit set. The bit that is set represents the register.
//
// Note:
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// Failure cases:
//  reg is missing from the data stack (underflow)
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthuimustbepreservedrmask ( U-IMUST-BE-PRESERVED-RMASK )
//
// C prototype:
//  void dg_forthuimustbepreservedrmask (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Stack action shorthand:
//  ( u -- rmask )
//
// Data stack in:
//  u                             the number of integer registers to include in 
//                                 the rmask 
//
// Data stack out:
//  rmask                         rmask containing u must be preserved registers
//                                                               
// Action:
//  Pops u off the data stack and uses it to get the lowest u set bits of the 
//   IMUST-BE-PRESERVED-RMASK. If u is greater than the number of bits set in 
//   IMUST-BE-PRESERVED-RMASK, then the IMUST-BE-PRESERVED-RMASK is pushed to the 
//   data stack. Otherwise, a mask containing the lowest u set bits in the
//   IMUST-BE-PRESERVED-RMASK is generated.
//   
//
// Note:
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//  You can use this to try to make sure a certain number of registers are available
//   for use without knowing exactly which ones they are...
//
// Failure cases:
//  u is missing from the data stack (underflow)
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthufmustbepreservedrmask ( U-FMUST-BE-PRESERVED-RMASK )
//
// C prototype:
//  void dg_forthufmustbepreservedrmask (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Stack action shorthand:
//  ( u -- rmask )
//
// Data stack in:
//  u                             the number of floating point registers to include  
//                                 in the rmask 
//
// Data stack out:
//  rmask                         rmask containing u must be preserved registers
//                                                               
// Action:
//  Pops u off the data stack and uses it to get the lowest u set bits of the 
//   FMUST-BE-PRESERVED-RMASK. If u is greater than the number of bits set in 
//   FMUST-BE-PRESERVED-RMASK, then the FMUST-BE-PRESERVED-RMASK is pushed to the 
//   data stack. Otherwise, a mask containing the lowest u set bits in the
//   FMUST-BE-PRESERVED-RMASK is generated.
//   
//
// Note:
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//  You can use this to try to make sure a certain number of registers are available
//   for use without knowing exactly which ones they are...
//
// Failure cases:
//  u is missing from the data stack (underflow)
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrmasknoframepreservecomma ( RMASK-NO-FRAME-PRESERVE, )
//
// C prototype:
//  void dg_forthrmasknoframepreservecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Stack action shorthand:
//  ( rmask -- )
//
// Data stack in:
//  rmask                         the rmask of the registers to preserve
//                                                               
// Action:
//  Pops rmask off the data stack and then compiles code to push the registers
//   represented by the rmask to the return stack. Then this updates the variables
//   at PRSDEPTH PREGS-PRESERVED-DEPTH PPRESERVED-RMASK and PUSED-RMASK so that some or 
//   all of the registers can be restored later, and so that references to parameters
//   that were preserved will work properly. 
//   
//
// Note:
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//  You can use this to try to make sure a certain number of registers are available
//   for use without knowing exactly which ones they are...
//  Because I used variables and not stacks for these variables you can only use
//   RMASK-NO-FRAME-PRESERVE, once per subroutine. If you want to use this more than 
//   once you will need to save PREGS-PRESERVED-DEPTH and PPRESERVED-RMASK and then 
//   restore them before you do the corresponding RMASK-NO-FRAME-UNPRESERVE,
//  PRSDEPTH is initialized when NO-FRAME-PARAMS< is done. (This means you HAVE
//   to use NO-FRAME-PARAMS< even if you have no parameters in order for 
//   RMASK-NO-FRAME-PRESERVE, to work properly.)
//
// Failure cases:
//  rmask is missing from the data stack (underflow)
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrmasknoframeunpreservecomma ( RMASK-NO-FRAME-UNPRESERVE, )
//
// C prototype:
//  void dg_forthrmasknoframeunpreservecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Stack action shorthand:
//  ( rmask -- )
//
// Data stack in:
//  rmask                         the rmask of the registers to unpreserve
//                                                               
// Action:
//  Pops rmask off the data stack and then compiles code to copy the preserved 
//   values on the return stack to the registers represented by the rmask.
//   This routine uses the values at PREGS-PRESERVED-DEPTH and 
//   PRSDEPTH to calculate the location of the preserved regs when compiling
//   the unpreserve code.
//   If any of the registers in rmask are not present in the preservedrmask at 
//   PPRESERVED-RMASK, then errors are pushed to the error stack instead.
//   
//
// Note:
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//  You can use this at the end of your subroutine to restore the registers
//   that were preserved using RMASK-NO-FRAME-PRESERVE,.
//
// Failure cases:
//  rmask is missing from the data stack (underflow)
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrmaskcallsubsframepreservecomma ( RMASK-CALL-SUBS-FRAME-PRESERVE, )
//
// C prototype:
//  void dg_forthrmaskcallsubsframepreservecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Stack action shorthand:
//  ( rmask -- )
//
// Data stack in:
//  rmask                         the rmask of the registers to preserve
//                                                               
// Action:
//  Pops rmask off the data stack and then compiles code to push the registers
//   represented by the rmask to the return stack and update the subroutine's
//   frame's bottom of local frame variable. Then this updates the variables
//   at PRSDEPTH PREGS-PRESERVED-DEPTH PPRESERVED-RMASK and PUSED-RMASK so that some or 
//   all of the registers can be restored later, and so that references to parameters
//   that were preserved will work properly.
//   
//
// Note:
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//  You can use this to try to make sure a certain number of registers are available
//   for use without knowing exactly which ones they are...
//  Because I used variables and not stacks for these variables you can only use
//   RMASK-CALL-SUBS-FRAME-PRESERVE, once per subroutine. If you want to use this more 
//   than once you will need to save PREGS-PRESERVED-DEPTH and PPRESERVED-RMASK and 
//   then restore them before you do the corresponding 
//   RMASK-CALL-SUBS-FRAME-UNPRESERVE,
//  PRSDEPTH is initialized when ENTER-CALL-SUBS-FRAME, or FRAME-PARAMS< is done.
//
// Failure cases:
//  rmask is missing from the data stack (underflow)
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrmaskcallsubsframeunpreservecomma ( RMASK-CALL-SUBS-FRAME-UNPRESERVE, )
//
// C prototype:
//  void dg_forthrmaskcallsubsframeunpreservecomma (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Stack action shorthand:
//  ( rmask -- )
//
// Data stack in:
//  rmask                         the rmask of the registers to unpreserve
//                                                               
// Action:
//  Pops rmask off the data stack and then compiles code to copy the preserved 
//   values in the subroutine's frame's local storage on the return stack to the 
//   registers represented by the rmask.
//   This routine uses the value at PREGS-PRESERVED-DEPTH to calculate the location 
//   of the preserved regs when compiling the unpreserve code.
//   If any of the registers in rmask are not present in the preservedrmask at 
//   PPRESERVED-RMASK, then errors are pushed to the error stack instead.
//   
//
// Note:
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//  You can use this at the end of your subroutine to restore the registers
//   that were preserved using RMASK-CALL-SUBS-FRAME-PRESERVE,.
//
// Failure cases:
//  rmask is missing from the data stack (underflow)
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthuiufmakesureavailablermask ( UI-UF-MAKE-SURE-AVAILABLE-RMASK )
//
// C prototype:
//  void dg_forthuiufmakesureavailablermask (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Stack action shorthand:
//  ( ui uf -- rmask )
//
// Data stack in:
//  ui                            the number of available integer registers wanted
//  uf                            the number of available floating point regs wanted
//
// Data stack out:
//  rmask                         the rmask of the registers to preserve
//                                                               
// Action:
//  Pops ui and uf off the data stack and then pushes the rmask of the registers
//   that must be preserved to make the wanted number of registers available.
//   If there are already enough available registers, an rmask of 0 is returned.
//   MUST-BE-PRESERVED registers are added to the mask first, followed by
//    registers used as parameters.
//   If there are not enough registers to preserve to get the wanted number,
//    all MUST-BE-PRESERVED registers and registers that were used as parameters
//    are added to the rmask.
//   
//
// Note:
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//  -1 -1 UI-UF-MAKE-SURE-AVAILABLE-RMASK generates an rmask to preserve as
//   many as possible.
//  Do not use this function to generate the mask for unpreserving registers
//   because a bunch of MUST-BE-PRESERVED registers are now marked as available
//   and it will be incorrect. Instead just unpreserve the MUST-BE-PRESERVED
//   registers that were preserved which you can get by doing 
//   PPRESERVED-RMASK @ MUST-BE-PRESERVED-RMASK AND 
//
// Example:
//  CODE moo
//   ENTER-CALL-SUBS-FRAME,
//   FRAME-PARAMS< x y FLOAT f0 >
//   3 2 UI-UF-MAKE-SURE-AVAILABLE-RMASK RMASK-CALL-SUBS-FRAME-PRESERVE,
//   
//   
//
// Failure cases:
//  ui or uf is missing from the data stack (underflow)
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrmaskuse ( RMASK-USE )
//
// C prototype:
//  void dg_forthrmaskuse (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Stack action shorthand:
//  ( rmask -- )
//
// Data stack in:
//  rmask                         the rmask of the registers to mark as used
//                                                               
// Action:
//  Pops rmask off the data stack and then if none of the registers indicated
//   by the rmask are currently marked as used, they are marked as used.
//   If any of the registers were already marked as used then errors are 
//   pushed to the error stack instead.
//   
//
// Note:
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//  The UINT64 rmask of the currently used registers is kept at PUSED-RMASK
//
// Failure cases:
//  rmask is missing from the data stack (underflow)
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthrmaskunuse ( RMASK-UNUSE )
//
// C prototype:
//  void dg_forthrmaskunuse (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Stack action shorthand:
//  ( rmask -- )
//
// Data stack in:
//  rmask                         the rmask of the registers to mark as unused
//                                                               
// Action:
//  Pops rmask off the data stack and then marks the registers indicated by the
//   rmask as unused. No errors are given if any of the registers were already
//   unused.
//   
//
// Note:
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//  The UINT64 rmask of the currently used registers is kept at PUSED-RMASK
//
// Failure cases:
//  rmask is missing from the data stack (underflow)
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthruse ( R-USE )
//
// C prototype:
//  void dg_forthruse (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Stack action shorthand:
//  ( reg -- )
//
// Data stack in:
//  reg                           the register to mark as used
//                                                               
// Action:
//  Pops reg off the data stack.
//  If the reg is not in the list of the registers used in an rmask then errors are
//   pushed to the error stack.
//  If the reg is in the list of registers used in an rmask but is already marked
//   as used then errors are pushed to the error stack.
//  If the reg is in the list of registers but not currently marked as used then 
//   it is marked as used. 
//
// Note:
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//  The UINT64 rmask of the currently used registers is kept at PUSED-RMASK
//
// Failure cases:
//  reg is missing from the data stack (underflow)
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthunusediruse ( UNUSED-IR-USE )
//
// C prototype:
//  void dg_forthunusediruse (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- reg )
//
// Data stack out:
//  reg                           the next available register
//                                                               
// Action:
//  Finds the next unused integer register in the list of allocatable registers, marks
//   the register as used, and pushes the register's default register identifier
//   to the data stack.
//  If no integer registers are available, NOREG is pushed to the data stack.
//
// Note:
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//  The UINT64 rmask of the currently used registers is kept at PUSED-RMASK
//
// Failure cases:
//  error pushing reg to the data stack
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthunusedfruse ( UNUSED-FR-USE )
//
// C prototype:
//  void dg_forthunusedfruse (Bufferhandle* pBHarrayhead)
//
// Inputs:
//  Bufferhandle* pBHarrayhead    pointer to a Bufferhandle structure which is 
//                                 used as the bufferhandle for the array where the 
//                                 other bufferhandles are stored.
//
// Stack action shorthand:
//  ( -- reg )
//
// Data stack out:
//  reg                           the next available register
//                                                               
// Action:
//  Finds the next unused floating point register in the list of allocatable registers, 
//   marks the register as used, and pushes the register's default register identifier
//   to the data stack.
//  If no floating point registers are available, NOREG is pushed to the data stack.
//
// Note:
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//  The UINT64 rmask of the currently used registers is kept at PUSED-RMASK
//
// Failure cases:
//  error pushing reg to the data stack
//  
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthiparam0 ( IPARAM0 )
//
// C prototype:
//  none (it's a UINT64 constant)
//
// Stack action shorthand:
//  ( -- iparam0reg )
//
//  iparam0reg                      UINT64 representing the register used to pass
//                                   the first integer parameter to a subroutine
//
// Action:
//  Pushes the default register identifier for the register used to pass the first 
//   integer parameter to a subroutine. On Windows, this is RCX, on Mac, this is RDI.
//
// Note:
//  On Windows, the first four parameters are passed in registers regardless of 
//   whether they are integer or float, so you would use this if the first parameter
//   is an integer.
//  On Mac, this is used for the first integer parameter regardless of which parameter
//   it is in the parameter list.
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthiparam1 ( IPARAM1 )
//
// C prototype:
//  none (it's a UINT64 constant)
//
// Stack action shorthand:
//  ( -- iparam1reg )
//
//  iparam1reg                      UINT64 representing the register used to pass
//                                   the second integer parameter to a subroutine
//
// Action:
//  Pushes the default register identifier for the register used to pass the second 
//   integer parameter to a subroutine. On Windows, this is RDX, on Mac, this is RSI.
//
// Note:
//  On Windows, the first four parameters are passed in registers regardless of 
//   whether they are integer or float, so you would use this if the second parameter
//   is an integer.
//  On Mac, this is used for the second integer parameter regardless of which parameter
//   it is in the parameter list.
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthiparam2 ( IPARAM2 )
//
// C prototype:
//  none (it's a UINT64 constant)
//
// Stack action shorthand:
//  ( -- iparam2reg )
//
//  iparam2reg                      UINT64 representing the register used to pass
//                                   the third integer parameter to a subroutine
//
// Action:
//  Pushes the default register identifier for the register used to pass the third 
//   integer parameter to a subroutine. On Windows, this is R8, on Mac, this is RDX.
//
// Note:
//  On Windows, the first four parameters are passed in registers regardless of 
//   whether they are integer or float, so you would use this if the third parameter
//   is an integer.
//  On Mac, this is used for the third integer parameter regardless of which parameter
//   it is in the parameter list.
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthiparam3 ( IPARAM3 )
//
// C prototype:
//  none (it's a UINT64 constant)
//
// Stack action shorthand:
//  ( -- iparam3reg )
//
//  iparam3reg                      UINT64 representing the register used to pass
//                                   the fourth integer parameter to a subroutine
//
// Action:
//  Pushes the default register identifier for the register used to pass the fourth 
//   integer parameter to a subroutine. On Windows, this is R9, on Mac, this is RCX.
//
// Note:
//  On Windows, the first four parameters are passed in registers regardless of 
//   whether they are integer or float, so you would use this if the fourth parameter
//   is an integer.
//  On Mac, this is used for the fourth integer parameter regardless of which parameter
//   it is in the parameter list.
//  An rmask is an array of bits where each set bit represents a register from the 
//   set of registers which can be used to hold local 64 bit variables and/or
//   parameters in a subroutine.
//
// //////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////
//
// dg_forthshadowsize ( SHADOWSIZE )
//
// C prototype:
//  none (it's a UINT64 constant)
//
// Stack action shorthand:
//  ( -- shadowsize )
//
//  shadowsize                      UINT64 size in bytes of the stack parameter shadow
//                                   region
//
// Action:
//  Pushes the platform specific size of the shadow region for parameters passed on
//   the return stack to the data stack. On Windows, this is 0x20. On Mac this is 0.
//
// //////////////////////////////////////////////////////////////////////////////////////