SSSE3, SSE4.1, and SSE4.2 Intrinsics
Required include files:
- MMX: mmintrin.h
- SSE: xmmintrin.h
- SSE2: emmintrin.h
- SSE3: pmmintrin.h
- SSSE3: tmmintrin.h
- SSE4A: ammintrin.h
- SSE4.1: smmintrin.h
- SSE4.2: nmmintrin.h
SSSE3
SSSE3 Assembler Syntax and Corresponding Compiler Intrinsics PSIGNB, PSIGNW, PSIGND - Syntax: psignb/psignw/psignd mem64/mmxreg, mmxreg psignb/psignw/psignd mem128/xmmreg, xmmreg - Semantic: Packed Sign - Corresponding intrinsics: extern __m64 _mm_sign_pi8 (__m64 p1, __m64 p2); extern __m64 _mm_sign_pi16 (__m64 p1, __m64 p2); extern __m64 _mm_sign_pi32 (__m64 p1, __m64 p2); extern __m128i _mm_sign_epi8 (__m128i p1, __m128i p2); extern __m128i _mm_sign_epi16 (__m128i p1, __m128i p2); extern __m128i _mm_sign_epi32 (__m128i p1, __m128i p2); PABSB, PABSW, PABSD - Syntax: pabsb/pabsw/pabsd mem64/mmxreg, mmxreg pabsb/pabsw/pabsd mem128/xmmreg, xmmreg - Semantic: Packed Absolute Value - Corresponding intrinsics: extern __m64 _mm_abs_pi8 (__m64 p); extern __m64 _mm_abs_pi16 (__m64 p); extern __m64 _mm_abs_pi32 (__m64 p); extern __m128i _mm_abs_epi8 (__m128i p); extern __m128i _mm_abs_epi16 (__m128i p); extern __m128i _mm_abs_epi32 (__m128i p); PALIGNR - Syntax: palignr imm, mem64/mmxreg, mmxreg palignr imm, mem128/xmmreg, xmmreg - Semantic: Packed Align Right - Corresponding intrinsics: extern __m64 _mm_alignr_pi8 (__m64 p1, __m64 p2, int immd); extern __m128i _mm_alignr_epi8 (__m128i p1, __m128i p2, int immd); PSHUFB - Syntax: pshufb mem64/mmxreg, mmxreg pshufb mem128/xmmreg, xmmreg - Semantic: Packed Shuffle Bytes - Corresponding intrinsics: extern __m64 _mm_shuffle_pi8 (__m64 p1, __m64 p2); extern __m128i _mm_shuffle_epi8 (__m128i p1, __m128i p2); PMULHRSW - Syntax: pmulhrsw mem64/mmxreg, mmxreg pmulhrsw mem128/xmmreg, xmmreg - Semantic: Packed Multiply High with Round and Scale - Corresponding intrinsics: extern __m64 _mm_mulhrs_pi16 (__m64 p1, __m64 p2); extern __m128i _mm_mulhrs_epi16 (__m128i p1, __m128i p2); PMADDUBSW - Syntax: pmaddubsw mem64/mmxreg, mmxreg pmaddubsw mem128/xmmreg, xmmreg - Semantic: Multiply and Add Packed Signed and Unsigned Bytes - Corresponding intrinsics: extern __m64 _mm_maddubs_pi16 (__m64 p1, __m64 p2); extern __m128i 
_mm_maddubs_epi16 (__m128i p1, __m128i p2); PHSUBW, PHSUBD - Syntax: phsubw/phsubd mem64/mmxreg, mmxreg phsubw/phsubd mem128/xmmreg, xmmreg - Semantic: Packed Horizontal Subtract - Corresponding intrinsics: extern __m64 _mm_hsub_pi16 (__m64 p1, __m64 p2); extern __m64 _mm_hsub_pi32 (__m64 p1, __m64 p2); extern __m128i _mm_hsub_epi16 (__m128i p1, __m128i p2); extern __m128i _mm_hsub_epi32 (__m128i p1, __m128i p2); PHSUBSW - Syntax: phsubsw mem64/mmxreg, mmxreg phsubsw mem128/xmmreg, xmmreg - Semantic: Packed Horizontal Subtract and Saturate Words - Corresponding intrinsics: extern __m64 _mm_hsubs_pi16 (__m64 p1, __m64 p2); extern __m128i _mm_hsubs_epi16 (__m128i p1, __m128i p2); PHADDW, PHADDD - Syntax: phaddw/phaddd mem64/mmxreg, mmxreg phaddw/phaddd mem128/xmmreg, xmmreg - Semantic: Packed Horizontal Add - Corresponding intrinsics: extern __m64 _mm_hadd_pi16 (__m64 p1, __m64 p2); extern __m64 _mm_hadd_pi32 (__m64 p1, __m64 p2); extern __m128i _mm_hadd_epi16 (__m128i p1, __m128i p2); extern __m128i _mm_hadd_epi32 (__m128i p1, __m128i p2); PHADDSW - Syntax: phaddsw mem64/mmxreg, mmxreg phaddsw mem128/xmmreg, xmmreg - Semantic: Packed Horizontal Add and Saturate Words - Corresponding intrinsics: extern __m64 _mm_hadds_pi16 (__m64 p1, __m64 p2); extern __m128i _mm_hadds_epi16 (__m128i p1, __m128i p2);
SSE4.1
SSE4.1 Assembler Syntax and Corresponding Compiler Intrinsics (Rev 1.0) BLENDPD/BLENDPS - Syntax: Blend packed double/single precision floating point values blendpd/blendps $imm8, xmmreg/mem128, xmmreg - Semantic: Copy elements from one location to another based on bits of an immediate operand - Corresponding intrinsics: __m128d _mm_blend_pd(__m128d p1, __m128d p2, const int immd); __m128 _mm_blend_ps(__m128 p1, __m128 p2, const int immd); BLENDVPD/BLENDVPS - Syntax: Variable blend double/single precision floating point values blendvpd/blendvps xmmreg/mem128, xmmreg blendvpd/blendvps XMMREG, xmmreg/mem128, xmmreg - Semantic: Copy elements from one location to another based on bits in register XMMREG - Corresponding intrinsics: __m128d _mm_blendv_pd(__m128d p1, __m128d p2, __m128d p3); __m128 _mm_blendv_ps(__m128 p1, __m128 p2, __m128 p3); DPPD/DPPS - Syntax: Dot product of packed double/single precision floating point values dppd/dpps $imm8, xmmreg/mem128, xmmreg - Semantic: Based on bits in the immediate operand to select which of the entries in the input to multiply and accumulate, and to select whether to put 0 or the dot-product in the correspondent field of the result register - Corresponding intrinsics: __m128d _mm_dp_pd(__m128d p1, __m128d p2, const int immd); __m128 _mm_dp_ps(__m128 p1, __m128 p2, const int immd); EXTRACTPS - Syntax: Extract packed single precision floating point value extractps $imm8, xmmreg, reg32/mem32 extractps $imm8, xmmreg, reg64/mem64 - Semantic: Based on bits in the immediate operand to extract a field from the source register and insert it into an x86 register or memory address - Corresponding intrinsics: int _mm_extract_ps(__m128 p1, const int immd); INSERTPS - Syntax: Insert packed single precision floating point value insertps $imm8, xmmreg/mem32, xmmreg - Semantic: Load a floating point value from memory indicated by mem32 or based on bits in the immediate operand to select a single precision floating point value from the 
source xmmreg and insert it into the destination register also based on the bits of the immediate operand - Corresponding intrinsics: __m128 _mm_insert_ps(__m128 p1, __m128 p2, const int immd); MOVNTDQA - Syntax: Load 16 bytes with non-temporal Aligned Hint movntdqa mem128, xmmreg - Semantic: Load from write-combining memory area into xmm register - Corresponding intrinsics: __m128i _mm_stream_load_si128(__m128i *p); MPSADBW - Syntax: Calculate multiple packed sums of absolute difference mpsadbw $imm8, xmmreg/mem128, xmmreg - Semantic: Based on bits in the immediate operand to select the destination and source fields to be used, compute eight offset sums of absolute differences for (|x0-y0|+|x1-y1|+|x2-y2|+...) - Corresponding intrinsics: __m128i _mm_mpsadbw_epu8(__m128i p1, __m128i p2, const int immd); PACKUSDW - Syntax: Pack with Unsigned Saturation packusdw xmmreg/mem128, xmmreg - Semantic: Convert signed 4 bytes in source and destination operands into unsigned 2 bytes with saturation - Corresponding intrinsics: __m128i _mm_packus_epi32(__m128i p1, __m128i p2); PBLENDW - Syntax: Blend packed 16-bit words pblendw $imm8, xmmreg/mem128, xmmreg - Semantic: Based on bits in the immediate operand, select 16-bit values from the second and destination operands to be stored into the destination operand - Corresponding intrinsics: __m128i _mm_blend_epi16(__m128i p1, __m128i p2, const int p3); PCMPEQQ - Syntax: Compare packed 64-bit values for equality pcmpeqq xmmreg/mem128, xmmreg - Semantic: Compare packed 64-bit values in source and destination operand for equality. 
Set all 0s or all 1s in destination register as result - Corresponding intrinsics: __m128i _mm_cmpeq_epi64(__m128i p1, __m128i p2); PEXTRB/PEXTRW/PEXTRD/PEXTRQ - Syntax: Extract byte/16-bit value/32-bit value/64-bit value pextrb $imm8, xmmreg, reg32/mem8 pextrb $imm8, xmmreg, reg64/mem8 pextrw $imm8, xmmreg, reg32/mem16 pextrw $imm8, xmmreg, reg64/mem16 pextrd $imm8, xmmreg, reg32/mem32 pextrq $imm8, xmmreg, reg64/mem64 - Semantic: Based on bits in the immediate operand to select and extract a 8/16/32/64-bit value from the xmmreg and store into the destination operand - Corresponding intrinsics: int _mm_extract_epi8(__m128i p1, const int immd); int _mm_extract_epi16(__m128i p1, const int immd); int _mm_extract_epi32(__m128i p1, const int immd); long long _mm_extract_epi64(__m128i p1, const int immd); PHMINPOSUW - Syntax: Packed horizontal 16-bit value minimum phminposuw xmmreg/mem128, xmmreg - Semantic: Find the minimum unsigned 16-bit value in the source operand and place the value and its index in the destination register - Corresponding intrinsics: __m128i _mm_minpos_epu16(__m128i p1); PINSRB/PINSRD/PINSRQ - Syntax: Insert byte, 32-bit value, 64-bit value pinsrb $imm8, reg32/mem8, xmmreg pinsrd $imm8, reg32/mem32, xmmreg pinsrq $imm8, reg64/mem64, xmmreg - Semantic: Based on the bits in the immediate operand to insert the byte/32-bit/64-bit value from the source operand into the destination xmm register - Corresponding intrinsics: __m128i _mm_insert_epi8(__m128i p1, int p2, const int immd); __m128i _mm_insert_epi32(__m128i p1, int p2, const int immd); __m128i _mm_insert_epi64(__m128i p1, long long p2, const int immd); PMAXSB/PMAXSD - Syntax: Maximum of packed signed byte/32-bit integers pmaxsb xmmreg/mem128, xmmreg pmaxsd xmmreg/mem128, xmmreg - Semantic: Compare the packed signed byte/32-bit values in the 2 operands and store the maximum packed values in the destination register - Corresponding intrinsics: __m128i _mm_max_epi8(__m128i p1, __m128i p2); __m128i 
_mm_max_epi32(__m128i p1, __m128i p2); PMAXUW/PMAXUD - Syntax: Maximum of packed unsigned 16-bit/32-bit integers pmaxuw xmmreg/mem128, xmmreg pmaxud xmmreg/mem128, xmmreg - Semantic: Compare the packed unsigned 16-bit/32-bit values in the 2 operands and store the maximum packed values in the destination register - Corresponding intrinsics: __m128i _mm_max_epu16(__m128i p1, __m128i p2); __m128i _mm_max_epu32(__m128i p1, __m128i p2); PMINSB/PMINSD - Syntax: Minimum of packed signed byte/32-bit integers pminsb xmmreg/mem128, xmmreg pminsd xmmreg/mem128, xmmreg - Semantic: Compare the packed signed byte/32-bit values in the 2 operands and store the minimum packed values in the destination register - Corresponding intrinsics: __m128i _mm_min_epi8(__m128i p1, __m128i p2); __m128i _mm_min_epi32(__m128i p1, __m128i p2); PMINUW/PMINUD - Syntax: Minimum of packed unsigned 16-bit/32-bit integers pminuw xmmreg/mem128, xmmreg pminud xmmreg/mem128, xmmreg - Semantic: Compare the packed unsigned 16-bit/32-bit values in the 2 operands and store the minimum packed values in the destination register - Corresponding intrinsics: __m128i _mm_min_epu32(__m128i p1, __m128i p2); __m128i _mm_min_epu16(__m128i p1, __m128i p2); PMOVSXBW/PMOVSXBD/PMOVSXBQ/PMOVSXWD/PMOVSXWQ/PMOVSXDQ - Syntax: Move packed values with sign extension pmovsxbw xmmreg/mem64, xmmreg pmovsxbd xmmreg/mem32, xmmreg pmovsxbq xmmreg/mem16, xmmreg pmovsxwd xmmreg/mem64, xmmreg pmovsxwq xmmreg/mem32, xmmreg pmovsxdq xmmreg/mem64, xmmreg - Semantic: Sign extend 8/4/2 packed 8-bit values or 4/2 packed 16-bit values or 2 packed 32-bit values in the source operand and move it into 8/4/2 packed 16-bit/32-bit/64-bit values or 4/2 packed 32-bit/64-bit values or 2 packed 64-bit values in the destination register respectively - Corresponding intrinsics: __m128i _mm_cvtepi8_epi16(__m128i p1); __m128i _mm_cvtepi8_epi32(__m128i p1); __m128i _mm_cvtepi8_epi64(__m128i p1); __m128i _mm_cvtepi16_epi32(__m128i p1); __m128i 
_mm_cvtepi16_epi64(__m128i p1); __m128i _mm_cvtepi32_epi64(__m128i p1); PMOVZXBW/PMOVZXBD/PMOVZXBQ/PMOVZXWD/PMOVZXWQ/PMOVZXDQ - Syntax: Move packed values with zero extension pmovzxbw xmmreg/mem64, xmmreg pmovzxbd xmmreg/mem32, xmmreg pmovzxbq xmmreg/mem16, xmmreg pmovzxwd xmmreg/mem64, xmmreg pmovzxwq xmmreg/mem32, xmmreg pmovzxdq xmmreg/mem64, xmmreg - Semantic: Zero extend 8/4/2 packed 8-bit values or 4/2 packed 16-bit values or 2 packed 32-bit values in the source operand and move it into 8/4/2 packed 16-bit/32-bit/64-bit values or 4/2 packed 32-bit/64-bit values or 2 packed 64-bit values in the destination register respectively - Corresponding intrinsics: __m128i _mm_cvtepu8_epi16(__m128i p1); __m128i _mm_cvtepu8_epi32(__m128i p1); __m128i _mm_cvtepu8_epi64(__m128i p1); __m128i _mm_cvtepu16_epi32(__m128i p1); __m128i _mm_cvtepu16_epi64(__m128i p1); __m128i _mm_cvtepu32_epi64(__m128i p1); PMULLD/PMULDQ - Syntax: Multiply packed signed 32-bit integers, producing 32-bit/64-bit results pmulld xmmreg/mem128, xmmreg pmuldq xmmreg/mem128, xmmreg - Semantic: Multiply the packed signed 32-bit values in the 2 operands and store the low 32 bits of each product (pmulld), or the full 64-bit product of the low 32-bit value in each 64-bit element (pmuldq), in the destination register - Corresponding intrinsics: __m128i _mm_mullo_epi32(__m128i p1, __m128i p2); __m128i _mm_mul_epi32(__m128i p1, __m128i p2); PTEST - Syntax: Logical compare ptest xmmreg/mem128, xmmreg - Semantic: Set the Z flag if the bitwise AND of the 2 operands is all 0s, and the C flag if the bitwise AND of the source operand with the complement of the destination operand is all 0s. 
- Corresponding intrinsics: int _mm_testz_si128(__m128i p1, __m128i p2); int _mm_testc_si128(__m128i p1, __m128i p2); int _mm_testnzc_si128(__m128i p1, __m128i p2); ROUNDPS/ROUNDPD - Syntax: Round packed single/double precision floating point values roundps $imm8, xmmreg/mem128, xmmreg roundpd $imm8, xmmreg/mem128, xmmreg - Semantic: Based on the rounding mode in the immediate operand, round the single/double precision packed values in the source operand and place them in the destination register - Corresponding intrinsics: __m128 _mm_round_ps(__m128 p1, int immd); __m128 _mm_floor_ps(__m128 p1); __m128 _mm_ceil_ps(__m128 p1); __m128d _mm_round_pd(__m128d p1, int immd); __m128d _mm_floor_pd(__m128d p1); __m128d _mm_ceil_pd(__m128d p1); ROUNDSS/ROUNDSD - Syntax: Round scalar single/double precision floating point values roundss $imm8, xmmreg/mem32, xmmreg roundsd $imm8, xmmreg/mem64, xmmreg - Semantic: Based on the rounding mode in the immediate operand, round the single/double precision scalar low value in the source operand and place it in the destination register - Corresponding intrinsics: __m128 _mm_round_ss(__m128 p1, __m128 p2, int immd); __m128 _mm_floor_ss(__m128 p1, __m128 p2); __m128 _mm_ceil_ss(__m128 p1, __m128 p2); __m128d _mm_round_sd(__m128d p1, __m128d p2, int immd); __m128d _mm_floor_sd(__m128d p1, __m128d p2); __m128d _mm_ceil_sd(__m128d p1, __m128d p2);
SSE4.2
SSE4.2 Assembler Syntax and Corresponding Compiler Intrinsics CRC32 - Syntax: Accumulate CRC32 value crc32 reg8/reg16/reg32/mem8/mem16/mem32, reg32 crc32 reg8/reg64/mem8/mem64, reg64 crc32b reg8/mem8, reg32 crc32b reg8/mem8, reg64 crc32w reg16/mem16, reg32 crc32l reg32/mem32, reg32 crc32q reg64/mem64, reg64 - Semantic: Accumulate CRC32C value using the polynomial 0x11edc6f41 - Corresponding intrinsics: unsigned int _mm_crc32_u8(unsigned int crc, unsigned char data); unsigned int _mm_crc32_u16(unsigned int crc, unsigned short data); unsigned int _mm_crc32_u32(unsigned int crc, unsigned int data); unsigned long long _mm_crc32_u64(unsigned long long crc, unsigned long long data); PCMPESTRI - Syntax: Packed compare explicit length strings, return index pcmpestri $imm8, xmmreg/mem128, xmmreg - Semantic: Perform packed comparison of string data with explicit lengths, generate an index stored to %ecx - Corresponding intrinsics: int _mm_cmpestri(__m128i a, int len_a, __m128i b, int len_b, const int imm); int _mm_cmpestra(__m128i a, int len_a, __m128i b, int len_b, const int imm); int _mm_cmpestrc(__m128i a, int len_a, __m128i b, int len_b, const int imm); int _mm_cmpestro(__m128i a, int len_a, __m128i b, int len_b, const int imm); int _mm_cmpestrs(__m128i a, int len_a, __m128i b, int len_b, const int imm); int _mm_cmpestrz(__m128i a, int len_a, __m128i b, int len_b, const int imm); PCMPESTRM - Syntax: Packed compare explicit length strings, return mask pcmpestrm $imm8, xmmreg/mem128, xmmreg - Semantic: Perform packed comparison of string data with explicit lengths, generate a mask stored to %xmm0 - Corresponding intrinsics: __m128i _mm_cmpestrm(__m128i a, int len_a, __m128i b, int len_b, const int imm); int _mm_cmpestra(__m128i a, int len_a, __m128i b, int len_b, const int imm); int _mm_cmpestrc(__m128i a, int len_a, __m128i b, int len_b, const int imm); int _mm_cmpestro(__m128i a, int len_a, __m128i b, int len_b, const int imm); int _mm_cmpestrs(__m128i a, int len_a, __m128i b, 
int len_b, const int imm); int _mm_cmpestrz(__m128i a, int len_a, __m128i b, int len_b, const int imm); PCMPISTRI - Syntax: Packed compare implicit length strings, return index pcmpistri $imm8, xmmreg/mem128, xmmreg - Semantic: Perform packed comparison of string data with implicit lengths, generate an index stored to %ecx - Corresponding intrinsics: int _mm_cmpistri(__m128i a, __m128i b, const int imm); int _mm_cmpistra(__m128i a, __m128i b, const int imm); int _mm_cmpistrc(__m128i a, __m128i b, const int imm); int _mm_cmpistro(__m128i a, __m128i b, const int imm); int _mm_cmpistrs(__m128i a, __m128i b, const int imm); int _mm_cmpistrz(__m128i a, __m128i b, const int imm); PCMPESTRM - Syntax: Packed compare explicit length strings, return mask pcmpestrm $imm8, xmmreg/mem128, xmmreg - Semantic: Perform packed comparison of string data with explicit lengths, generate a mask stored to %xmm0 - Corresponding intrinsics: __m128i _mm_cmpestrm(__m128i a, int len_a, __m128i b, int len_b, const int imm); int _mm_cmpestra(__m128i a, int len_a, __m128i b, int len_b, const int imm); int _mm_cmpestrc(__m128i a, int len_a, __m128i b, int len_b, const int imm); int _mm_cmpestro(__m128i a, int len_a, __m128i b, int len_b, const int imm); int _mm_cmpestrs(__m128i a, int len_a, __m128i b, int len_b, const int imm); int _mm_cmpestrz(__m128i a, int len_a, __m128i b, int len_b, const int imm); PCMPISTRI - Syntax: Packed compare implicit length strings, return index pcmpistri $imm8, xmmreg/mem128, xmmreg - Semantic: Perform packed comparison of string data with implicit lengths, generate an index stored to %ecx - Corresponding intrinsics: int _mm_cmpistri(__m128i a, __m128i b, const int imm); int _mm_cmpistra(__m128i a, __m128i b, const int imm); int _mm_cmpistrc(__m128i a, __m128i b, const int imm); int _mm_cmpistro(__m128i a, __m128i b, const int imm); int _mm_cmpistrs(__m128i a, __m128i b, const int imm); int _mm_cmpistrz(__m128i a, __m128i b, const int imm); PCMPESTRM - Syntax: Packed 
compare explicit length strings, return mask pcmpestrm $imm8, xmmreg/mem128, xmmreg - Semantic: Perform packed comparison of string data with explicit lengths, generate a mask stored to %xmm0 - Corresponding intrinsics: __m128i _mm_cmpestrm(__m128i a, int len_a, __m128i b, int len_b, const int imm); int _mm_cmpestra(__m128i a, int len_a, __m128i b, int len_b, const int imm); int _mm_cmpestrc(__m128i a, int len_a, __m128i b, int len_b, const int imm); int _mm_cmpestro(__m128i a, int len_a, __m128i b, int len_b, const int imm); int _mm_cmpestrs(__m128i a, int len_a, __m128i b, int len_b, const int imm); int _mm_cmpestrz(__m128i a, int len_a, __m128i b, int len_b, const int imm); PCMPISTRI - Syntax: Packed compare implicit length strings, return index pcmpistri $imm8, xmmreg/mem128, xmmreg - Semantic: Perform packed comparison of string data with implicit lengths, generate an index stored to %ecx - Corresponding intrinsics: int _mm_cmpistri(__m128i a, __m128i b, const int imm); int _mm_cmpistra(__m128i a, __m128i b, const int imm); int _mm_cmpistrc(__m128i a, __m128i b, const int imm); int _mm_cmpistro(__m128i a, __m128i b, const int imm); int _mm_cmpistrs(__m128i a, __m128i b, const int imm); int _mm_cmpistrz(__m128i a, __m128i b, const int imm); PCMPISTRM - Syntax: Packed compare implicit length strings, return mask pcmpistrm $imm8, xmmreg/mem128, xmmreg - Semantic: Perform packed comparison of string data with implicit lengths, generate a mask stored to %xmm0 - Corresponding intrinsics: __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm); int _mm_cmpistra(__m128i a, __m128i b, const int imm); int _mm_cmpistrc(__m128i a, __m128i b, const int imm); int _mm_cmpistro(__m128i a, __m128i b, const int imm); int _mm_cmpistrs(__m128i a, __m128i b, const int imm); int _mm_cmpistrz(__m128i a, __m128i b, const int imm); PCMPGTQ - Syntax: Compare packed data for greater than pcmpgtq xmmreg/mem128, xmmreg - Semantic: Compare packed 64-bit values in xmmreg/mem128 with xmmreg. 
Set corresponding data in destination register to all 1s or 0s based on the result of the greater than compare. - Corresponding intrinsics: __m128i _mm_cmpgt_epi64(__m128i a, __m128i b); POPCNT - Syntax: Population count popcnt reg16/mem16, reg16 popcnt reg32/mem32, reg32 popcnt reg64/mem64, reg64 - Semantic: Count the number of set bits in reg/mem - Corresponding intrinsics: int _mm_popcnt_u32(unsigned int a); long long _mm_popcnt_u64(unsigned long long a);