Submission #304634


Source Code Expand

#include <iostream>
#include <cstdint>
#include <emmintrin.h>

using namespace std;

//-----------------------------------------------------------------------------
// __attribute__((aligned(16)))
//   変数の先頭アドレスが16バイト境界となるように領域を確保する。
//   GCCやClangで利用可能な拡張。VC++では__declspec(align(16))。
int32_t x[50000] __attribute__((aligned(16)));
int32_t y[50000] __attribute__((aligned(16)));

//-----------------------------------------------------------------------------
// SSE4.1で追加された命令のラッパー。
// AtCoderのコンパイルオプションだとintrinsicではSSE2までしか扱えないため。

// 64bit整数2つの比較 (greater than)
// [ a_0, a_1 ], [ b_0, b_1 ]
//   => [ (a_0 > b_0 ? -1 : 0), (a_1 > b_1 ? -1 : 0) ]
static inline __m128i __attribute__((always_inline))
_mm_cmpgt_epi64(__m128i a, __m128i b){
	__asm__("pcmpgtq %1, %0" : "+x" (a) : "xm" (b));
	return a;
}

// 32bit整数2つの積 (64bitに拡張)
// [ a_0, a_1, a_2, a_3 ] * [ b_0, b_1, b_2, b_3 ]
//   => [ (ll)a_0 * b_0, (ll)a_2 * b_2 ]
static inline __m128i __attribute__((always_inline))
_mm_mul_epi32(__m128i a, __m128i b){
	__asm__("pmuldq %1, %0" : "+x" (a) : "xm" (b));
	return a;
}

//-----------------------------------------------------------------------------
int main(){
	ios_base::sync_with_stdio(false);
	int n;
	int64_t s1, s2;
	cin >> n >> s1 >> s2;
	for(int i = 0; i < n; ++i){ cin >> x[i] >> y[i]; }
	int64_t answer = 0;
	// 上限・下限のベクトル化
	// lo = [ s1 - 1, s1 - 1 ]
	const __m128i lo = _mm_set1_epi64x(s1 - 1);
	// hi = [ s2, s2 ]
	const __m128i hi = _mm_set1_epi64x(s2);
	// 全部の組み合わせを試す
	for(int i = 0; i < n; ++i){
		// xi = [ x[i], x[i], x[i], x[i] ]
		const __m128i xi = _mm_set1_epi32(x[i]);
		// yi = [ y[i], y[i], y[i], y[i] ]
		const __m128i yi = _mm_set1_epi32(y[i]);
		// sum = [ 0, 0 ]
		__m128i sum = _mm_setzero_si128();
		// 1回のループで4要素を処理
		for(int j = 0; j + 3 < i; j += 4){
			// xj = [ x[j], x[j + 1], x[j + 2], x[j + 3] ]
			const __m128i xj = _mm_load_si128((__m128i *)(x + j));
			// yj = [ y[j], y[j + 1], y[j + 2], y[j + 3] ]
			const __m128i yj = _mm_load_si128((__m128i *)(y + j));
			// dx = [ x[i] - x[j], x[i] - x[j + 1], x[i] - x[j + 2], x[i] - x[j + 3] ]
			const __m128i dx = _mm_sub_epi32(xi, xj);
			// dy = [ y[i] - y[j], y[i] - y[j + 1], y[i] - y[j + 2], y[i] - y[j + 3] ]
			const __m128i dy = _mm_sub_epi32(yi, yj);
			// shift_dx = [ dx_1, dx_2, dx_3, 0 ]
			const __m128i shift_dx = _mm_srli_si128(dx, 4); // 4バイト右シフト
			// shift_dy = [ dy_1, dy_2, dy_3, 0 ]
			const __m128i shift_dy = _mm_srli_si128(dy, 4); // 4バイト右シフト
			// z = [ dx_0 * dy_0, dx_1 * dy_1, dx_2 * dy_2, dx_3 * dy_3 ]
			// を2要素ずつz0, z1に分割する (64bit整数を扱うため)
			// z0 = [ dx_0 * dy_0, dx_2 * dy_2 ]
			const __m128i z0 = _mm_mul_epi32(dx, dy);
			// z1 = [ dx_1 * dy_1, dx_3 * dy_3 ]
			const __m128i z1 = _mm_mul_epi32(shift_dx, shift_dy);
			// s1, s2 と各要素を比較する
			// z0_lo = [ (z_0 > s1 - 1 ? -1 : 0), (z_2 > s1 - 1 ? -1 : 0) ]
			//       = [ (z_0 >= s1 ? -1 : 0), (z_2 >= s1 ? -1 : 0) ]
			const __m128i z0_lo = _mm_cmpgt_epi64(z0, lo);
			// z1_lo = [ (z_1 > s1 - 1 ? -1 : 0), (z_3 > s1 - 1 ? -1 : 0) ]
			//       = [ (z_1 >= s1 ? -1 : 0), (z_3 >= s1 ? -1 : 0) ]
			const __m128i z1_lo = _mm_cmpgt_epi64(z1, lo);
			// z0_hi = [ (z_0 > s2 ? -1 : 0), (z_2 > s2 ? -1 : 0) ]
			const __m128i z0_hi = _mm_cmpgt_epi64(z0, hi);
			// z1_hi = [ (z_1 > s2 ? -1 : 0), (z_3 > s2 ? -1 : 0) ]
			const __m128i z1_hi = _mm_cmpgt_epi64(z1, hi);
			// s1, s2 と比較した結果の AND をとる
			// mask0 = [ ~z0_hi_0 & z0_lo_0, ~z0_hi_1 & z0_lo_1 ]
			//       = [ (s1 <= z_0 && z_0 < s2 ? -1 : 0), (s1 <= z_2 && z_2 < s2 ? -1 : 0) ]
			const __m128i mask0 = _mm_andnot_si128(z0_hi, z0_lo);
			// mask1 = [ ~z1_hi_0 & z1_lo_0, ~z1_hi_1 & z1_lo_1 ]
			//       = [ (s1 <= z_1 && z_1 < s2 ? -1 : 0), (s1 <= z_3 && z_3 < s2 ? -1 : 0) ]
			const __m128i mask1 = _mm_andnot_si128(z1_hi, z1_lo);
			// dx * dy が [ s1, s2 ] の範囲内であれば対応する要素に
			// -1 が入っているので合計からその値を引く。
			// sum = [ sum_0 - mask0_0, sum_1 - mask0_1 ]
			sum = _mm_sub_epi64(sum, mask0);
			// sum = [ sum_0 - mask1_0, sum_1 - mask1_1 ]
			sum = _mm_sub_epi64(sum, mask1);
		}
		// sum に入っている値を一旦メモリに書き出す
		int64_t buffer[2] __attribute__((aligned(16)));
		// buffer = [ sum_0, sum_1 ]
		_mm_store_si128((__m128i *)buffer, sum);
		// ベクトル型ではない方の答えに加算する
		answer += buffer[0] + buffer[1];
		// 端数処理 (4ずつ処理した残り)
		for(int j = i & ~3; j < i; ++j){
			const int64_t dx = x[i] - x[j], dy = y[i] - y[j];
			const int64_t z = dx * dy;
			if(s1 <= z && z <= s2){ ++answer; }
		}
	}
	cout << answer << endl;
	return 0;
}

Submission Info

Submission Time
Task H - 平和協定
User logicmachine
Language C++11 (GCC 4.8.1)
Score 100
Code Size 5100 Byte
Status AC
Exec Time 2144 ms
Memory 1312 KB

Judge Result

Set Name Sample Subtask1 Subtask2
Score / Max Score 0 / 0 10 / 10 90 / 90
Status
AC × 3
AC × 17
AC × 48
Set Name Test Cases
Sample sample_01.txt, sample_02.txt, sample_03.txt
Subtask1 sample_01.txt, sample_02.txt, sample_03.txt, subtask1_01.txt, subtask1_02.txt, subtask1_03.txt, subtask1_04.txt, subtask1_05.txt, subtask1_06.txt, subtask1_07.txt, subtask1_08.txt, subtask1_09.txt, subtask1_10.txt, subtask1_11.txt, subtask1_12.txt, subtask1_13.txt, subtask1_14.txt
Subtask2 sample_01.txt, sample_02.txt, sample_03.txt, subtask1_01.txt, subtask1_02.txt, subtask1_03.txt, subtask1_04.txt, subtask1_05.txt, subtask1_06.txt, subtask1_07.txt, subtask1_08.txt, subtask1_09.txt, subtask1_10.txt, subtask1_11.txt, subtask1_12.txt, subtask1_13.txt, subtask1_14.txt, subtask2_01.txt, subtask2_02.txt, subtask2_03.txt, subtask2_04.txt, subtask2_05.txt, subtask2_06.txt, subtask2_07.txt, subtask2_08.txt, subtask2_09.txt, subtask2_10.txt, subtask2_11.txt, subtask2_12.txt, subtask2_13.txt, subtask2_14.txt, subtask2_15.txt, subtask2_16.txt, subtask2_17.txt, subtask2_18.txt, subtask2_19.txt, subtask2_20.txt, subtask2_21.txt, subtask2_22.txt, subtask2_23.txt, subtask2_24.txt, subtask2_25.txt, subtask2_26.txt, subtask2_27.txt, subtask2_28.txt, subtask2_29.txt, subtask2_30.txt, subtask2_31.txt
Case Name Status Exec Time Memory
sample_01.txt AC 24 ms 924 KB
sample_02.txt AC 25 ms 796 KB
sample_03.txt AC 24 ms 796 KB
subtask1_01.txt AC 27 ms 800 KB
subtask1_02.txt AC 28 ms 800 KB
subtask1_03.txt AC 27 ms 932 KB
subtask1_04.txt AC 34 ms 736 KB
subtask1_05.txt AC 26 ms 808 KB
subtask1_06.txt AC 30 ms 800 KB
subtask1_07.txt AC 26 ms 928 KB
subtask1_08.txt AC 23 ms 924 KB
subtask1_09.txt AC 33 ms 924 KB
subtask1_10.txt AC 33 ms 924 KB
subtask1_11.txt AC 33 ms 800 KB
subtask1_12.txt AC 32 ms 756 KB
subtask1_13.txt AC 33 ms 800 KB
subtask1_14.txt AC 33 ms 924 KB
subtask2_01.txt AC 47 ms 796 KB
subtask2_02.txt AC 195 ms 932 KB
subtask2_03.txt AC 115 ms 796 KB
subtask2_04.txt AC 175 ms 932 KB
subtask2_05.txt AC 876 ms 1060 KB
subtask2_06.txt AC 750 ms 1052 KB
subtask2_07.txt AC 102 ms 804 KB
subtask2_08.txt AC 223 ms 924 KB
subtask2_09.txt AC 717 ms 928 KB
subtask2_10.txt AC 2129 ms 1184 KB
subtask2_11.txt AC 2127 ms 1188 KB
subtask2_12.txt AC 2130 ms 1180 KB
subtask2_13.txt AC 2127 ms 1184 KB
subtask2_14.txt AC 2126 ms 1176 KB
subtask2_15.txt AC 2129 ms 1184 KB
subtask2_16.txt AC 2144 ms 1188 KB
subtask2_17.txt AC 2132 ms 1312 KB
subtask2_18.txt AC 2122 ms 1172 KB
subtask2_19.txt AC 2126 ms 1180 KB
subtask2_20.txt AC 2133 ms 1184 KB
subtask2_21.txt AC 2125 ms 1180 KB
subtask2_22.txt AC 2131 ms 1192 KB
subtask2_23.txt AC 2128 ms 1188 KB
subtask2_24.txt AC 2130 ms 1184 KB
subtask2_25.txt AC 2125 ms 1184 KB
subtask2_26.txt AC 2123 ms 1152 KB
subtask2_27.txt AC 2112 ms 1188 KB
subtask2_28.txt AC 2128 ms 1180 KB
subtask2_29.txt AC 2133 ms 1192 KB
subtask2_30.txt AC 2104 ms 1120 KB
subtask2_31.txt AC 2134 ms 1184 KB