NOTE(review): this patch was recovered from a copy in which newlines and
indentation had been collapsed and angle-bracketed text (template parameter
lists, <...> include targets) had been dropped. Line breaks follow the hunk
line counts. Context-line whitespace, the template parameter names and the
system header names are assumptions -- confirm them against the target tree
(or apply with --ignore-whitespace) before relying on this file.

Change relative to the recovered text: the s390x validate_ascii_fast()
accumulated input with vec_orc (OR with the complement of the second operand),
which contradicts its own "bitwise OR" comment and sets the high bit of the
accumulator for every 7-bit byte. It now uses vec_or, so only bytes >= 0x80
leave their high bit set. Comments in the s390x CompareFP() were reworded to
refer to the variables that actually exist (v1, v2, rv_mask).

diff --git a/patches/lua-v5.4.4.patch b/patches/lua-v5.4.4.patch
index 2e913021d..dd2256856 100644
--- a/patches/lua-v5.4.4.patch
+++ b/patches/lua-v5.4.4.patch
@@ -12,10 +12,10 @@ index d42d14b7..75647e72 100644
  #define LUAI_MAXSTACK 15000
  #endif
 diff --git a/makefile b/makefile
-index d46e650c..e347e614 100644
+index d46e650c..c27e5677 100644
 --- a/makefile
 +++ b/makefile
-@@ -66,13 +66,23 @@ LOCAL = $(TESTS) $(CWARNS)
+@@ -66,13 +66,25 @@ LOCAL = $(TESTS) $(CWARNS)
  
  
  # enable Linux goodies
@@ -32,6 +32,8 @@ index d46e650c..e347e614 100644
 +OPTFLAGS= -march=sandybridge
 +else ifeq ($(uname_m), aarch64)
 +OPTFLAGS= -march=armv8.2-a+fp16+rcpc+dotprod+crypto
++else ifeq ($(uname_m), s390x)
++OPTFLAGS= -march=native
 +else
 +  $(error ERROR: unknown architecture $(uname_m))
 +endif
diff --git a/src/core/dash_internal.h b/src/core/dash_internal.h
index f57ed2bdd..0dd13f7b6 100644
--- a/src/core/dash_internal.h
+++ b/src/core/dash_internal.h
@@ -879,6 +879,34 @@ unsigned BucketBase::UnsetStashPtr(uint8_t fp_hash, unsigned
   return res;
 }
 
+#ifdef __s390x__
+template <unsigned NUM_SLOTS, unsigned NUM_STASH_FPS>
+uint32_t BucketBase<NUM_SLOTS, NUM_STASH_FPS>::CompareFP(uint8_t fp) const {
+  static_assert(FpArray{}.size() <= 16);
+  vector unsigned char v1;
+
+  // Replicate fp into all 16 byte lanes of v1.
+  for (int i = 0; i < 16; i++) {
+    v1[i] = fp;
+  }
+
+  // Load the 16 bytes starting at finger_arr_.data() into v2.
+  vector unsigned char v2 = vec_load_len(finger_arr_.data(), 16);
+
+  // Lane-wise byte compare: rv_mask[i] := (v1[i] == v2[i]) ? 0xFF : 0.
+  vector bool char rv_mask = vec_cmpeq(v1, v2);
+
+  // Collapse the 16 lanes of rv_mask into a bitmask: bit i is set iff lane i matched.
+  int mask = 0;
+  for (int i = 0; i < 16; i++) {
+    if (rv_mask[i]) {
+      mask |= 1 << i;
+    }
+  }
+
+  return mask;
+}
+#else
 template <unsigned NUM_SLOTS, unsigned NUM_STASH_FPS>
 uint32_t BucketBase<NUM_SLOTS, NUM_STASH_FPS>::CompareFP(uint8_t fp) const {
   static_assert(FpArray{}.size() <= 16);
@@ -898,6 +926,7 @@ uint32_t BucketBase::CompareFP(uint8_t fp) const {
   // Note: Last 2 operations can be combined in skylake with _mm_cmpeq_epi8_mask.
   return mask;
 }
+#endif
 
 // Bucket slot array goes from left to right: [x, x, ...]
 // Shift right vacates the first slot on the left by shifting all the elements right and
diff --git a/src/core/detail/bitpacking.cc b/src/core/detail/bitpacking.cc
index 3f0438971..310544bf9 100644
--- a/src/core/detail/bitpacking.cc
+++ b/src/core/detail/bitpacking.cc
@@ -104,6 +104,42 @@ static inline pair simd_variant2_pack(const char* ascii,
 // See https://github.com/lemire/fastvalidate-utf-8/
 // The function returns true (1) if all chars passed in src are
 // 7-bit values (0x00..0x7F). Otherwise, it returns false (0).
+#ifdef __s390x__
+bool validate_ascii_fast(const char* src, size_t len) {
+  size_t i = 0;
+
+  // Initialize a vector in which all the elements are set to zero.
+  vector unsigned char has_error = vec_splat_s8(0);
+  if (len >= 16) {
+    for (; i <= len - 16; i += 16) {
+      // Load 16 bytes from buffer into a vector.
+      vector unsigned char current_bytes = vec_load_len((signed char*)(src + i), 16);
+      // Accumulate with a plain OR: a high bit seen in any block stays set in has_error.
+      has_error = vec_or(has_error, current_bytes);
+    }
+  }
+
+  // Initialize a vector in which all the elements are set to an invalid ASCII value.
+  vector unsigned char rep_invalid_values = vec_splat_s8(0x80);
+
+  // rep_invalid_values & ~has_error: a lane keeps 0x80 only if its high bit was never set.
+  vector unsigned char andc_result = vec_andc(rep_invalid_values, has_error);
+
+  // Every lane must still equal 0x80; any difference means a byte >= 0x80 was seen in the
+  // vectorized prefix.
+  if (!vec_all_eq(rep_invalid_values, andc_result)) {
+    return false;
+  }
+
+  for (; i < len; i++) {
+    if (src[i] & 0x80) {
+      return false;
+    }
+  }
+
+  return true;
+}
+#else
 bool validate_ascii_fast(const char* src, size_t len) {
   size_t i = 0;
   __m128i has_error = _mm_setzero_si128();
@@ -123,6 +159,7 @@ bool validate_ascii_fast(const char* src, size_t len) {
 
   return !error_mask;
 }
+#endif
 
 // len must be at least 16
 void ascii_pack(const char* ascii, size_t len, uint8_t* bin) {
diff --git a/src/core/sse_port.h b/src/core/sse_port.h
index 692c24963..2c966bbdd 100644
--- a/src/core/sse_port.h
+++ b/src/core/sse_port.h
@@ -5,6 +5,8 @@
 #pragma once
 #if defined(__aarch64__)
 #include "base/sse2neon.h"
+#elif defined(__s390x__)
+#include <vecintrin.h>
 #else
 #include <emmintrin.h>
 #include <tmmintrin.h>
@@ -12,6 +14,7 @@
 
 namespace dfly {
 
+#ifndef __s390x__
 inline __m128i mm_loadu_si128(const __m128i* ptr) {
 #if defined(__aarch64__)
   __m128i res;
@@ -22,5 +25,6 @@ inline __m128i mm_loadu_si128(const __m128i* ptr) {
   return _mm_loadu_si128(ptr);
 #endif
 }
+#endif
 
 }  // namespace dfly