Submission #304634
Source Code Expand
#include <iostream>
#include <cstdint>
#include <emmintrin.h>
using namespace std;
//-----------------------------------------------------------------------------
// __attribute__((aligned(16)))
// 変数の先頭アドレスが16バイト境界となるように領域を確保する。
// GCCやClangで利用可能な拡張。VC++では__declspec(align(16))。
int32_t x[50000] __attribute__((aligned(16)));
int32_t y[50000] __attribute__((aligned(16)));
//-----------------------------------------------------------------------------
// SSE4.1で追加された命令のラッパー。
// AtCoderのコンパイルオプションだとintrinsicではSSE2までしか扱えないため。
// 64bit整数2つの比較 (greater than)
// [ a_0, a_1 ], [ b_0, b_1 ]
// => [ (a_0 > b_0 ? -1 : 0), (a_1 > b_1 ? -1 : 0) ]
static inline __m128i __attribute__((always_inline))
_mm_cmpgt_epi64(__m128i a, __m128i b){
__asm__("pcmpgtq %1, %0" : "+x" (a) : "xm" (b));
return a;
}
// 32bit整数2つの積 (64bitに拡張)
// [ a_0, a_1, a_2, a_3 ] * [ b_0, b_1, b_2, b_3 ]
// => [ (ll)a_0 * b_0, (ll)a_2 * b_2 ]
static inline __m128i __attribute__((always_inline))
_mm_mul_epi32(__m128i a, __m128i b){
__asm__("pmuldq %1, %0" : "+x" (a) : "xm" (b));
return a;
}
//-----------------------------------------------------------------------------
int main(){
ios_base::sync_with_stdio(false);
int n;
int64_t s1, s2;
cin >> n >> s1 >> s2;
for(int i = 0; i < n; ++i){ cin >> x[i] >> y[i]; }
int64_t answer = 0;
// 上限・下限のベクトル化
// lo = [ s1 - 1, s1 - 1 ]
const __m128i lo = _mm_set1_epi64x(s1 - 1);
// hi = [ s2, s2 ]
const __m128i hi = _mm_set1_epi64x(s2);
// 全部の組み合わせを試す
for(int i = 0; i < n; ++i){
// xi = [ x[i], x[i], x[i], x[i] ]
const __m128i xi = _mm_set1_epi32(x[i]);
// yi = [ y[i], y[i], y[i], y[i] ]
const __m128i yi = _mm_set1_epi32(y[i]);
// sum = [ 0, 0 ]
__m128i sum = _mm_setzero_si128();
// 1回のループで4要素を処理
for(int j = 0; j + 3 < i; j += 4){
// xj = [ x[j], x[j + 1], x[j + 2], x[j + 3] ]
const __m128i xj = _mm_load_si128((__m128i *)(x + j));
// yj = [ y[j], y[j + 1], y[j + 2], y[j + 3] ]
const __m128i yj = _mm_load_si128((__m128i *)(y + j));
// dx = [ x[i] - x[j], x[i] - x[j + 1], x[i] - x[j + 2], x[i] - x[j + 3] ]
const __m128i dx = _mm_sub_epi32(xi, xj);
// dy = [ y[i] - y[j], y[i] - y[j + 1], y[i] - y[j + 2], y[i] - y[j + 3] ]
const __m128i dy = _mm_sub_epi32(yi, yj);
// shift_dx = [ dx_1, dx_2, dx_3, 0 ]
const __m128i shift_dx = _mm_srli_si128(dx, 4); // 4バイト右シフト
// shift_dy = [ dy_1, dy_2, dy_3, 0 ]
const __m128i shift_dy = _mm_srli_si128(dy, 4); // 4バイト右シフト
// z = [ dx_0 * dy_0, dx_1 * dy_1, dx_2 * dy_2, dx_3 * dy_3 ]
// を2要素ずつz0, z1に分割する (64bit整数を扱うため)
// z0 = [ dx_0 * dy_0, dx_2 * dy_2 ]
const __m128i z0 = _mm_mul_epi32(dx, dy);
// z1 = [ dx_1 * dy_1, dx_3 * dy_3 ]
const __m128i z1 = _mm_mul_epi32(shift_dx, shift_dy);
// s1, s2 と各要素を比較する
// z0_lo = [ (z_0 > s1 - 1 ? -1 : 0), (z_2 > s1 - 1 ? -1 : 0) ]
// = [ (z_0 >= s1 ? -1 : 0), (z_2 >= s1 ? -1 : 0) ]
const __m128i z0_lo = _mm_cmpgt_epi64(z0, lo);
// z1_lo = [ (z_1 > s1 - 1 ? -1 : 0), (z_3 > s1 - 1 ? -1 : 0) ]
// = [ (z_1 >= s1 ? -1 : 0), (z_3 >= s1 ? -1 : 0) ]
const __m128i z1_lo = _mm_cmpgt_epi64(z1, lo);
// z0_hi = [ (z_0 > s2 ? -1 : 0), (z_2 > s2 ? -1 : 0) ]
const __m128i z0_hi = _mm_cmpgt_epi64(z0, hi);
// z1_hi = [ (z_1 > s2 ? -1 : 0), (z_3 > s2 ? -1 : 0) ]
const __m128i z1_hi = _mm_cmpgt_epi64(z1, hi);
// s1, s2 と比較した結果の AND をとる
// mask0 = [ ~z0_hi_0 & z0_lo_0, ~z0_hi_1 & z0_lo_1 ]
// = [ (s1 <= z_0 && z_0 < s2 ? -1 : 0), (s1 <= z_2 && z_2 < s2 ? -1 : 0) ]
const __m128i mask0 = _mm_andnot_si128(z0_hi, z0_lo);
// mask1 = [ ~z1_hi_0 & z1_lo_0, ~z1_hi_1 & z1_lo_1 ]
// = [ (s1 <= z_1 && z_1 < s2 ? -1 : 0), (s1 <= z_3 && z_3 < s2 ? -1 : 0) ]
const __m128i mask1 = _mm_andnot_si128(z1_hi, z1_lo);
// dx * dy が [ s1, s2 ] の範囲内であれば対応する要素に
// -1 が入っているので合計からその値を引く。
// sum = [ sum_0 - mask0_0, sum_1 - mask0_1 ]
sum = _mm_sub_epi64(sum, mask0);
// sum = [ sum_0 - mask1_0, sum_1 - mask1_1 ]
sum = _mm_sub_epi64(sum, mask1);
}
// sum に入っている値を一旦メモリに書き出す
int64_t buffer[2] __attribute__((aligned(16)));
// buffer = [ sum_0, sum_1 ]
_mm_store_si128((__m128i *)buffer, sum);
// ベクトル型ではない方の答えに加算する
answer += buffer[0] + buffer[1];
// 端数処理 (4ずつ処理した残り)
for(int j = i & ~3; j < i; ++j){
const int64_t dx = x[i] - x[j], dy = y[i] - y[j];
const int64_t z = dx * dy;
if(s1 <= z && z <= s2){ ++answer; }
}
}
cout << answer << endl;
return 0;
}
Submission Info
Submission Time |
|
Task |
H - 平和協定 |
User |
logicmachine |
Language |
C++11 (GCC 4.8.1) |
Score |
100 |
Code Size |
5100 Byte |
Status |
AC |
Exec Time |
2144 ms |
Memory |
1312 KB |
Judge Result
Set Name |
Sample |
Subtask1 |
Subtask2 |
Score / Max Score |
0 / 0 |
10 / 10 |
90 / 90 |
Status |
|
|
|
Set Name |
Test Cases |
Sample |
sample_01.txt, sample_02.txt, sample_03.txt |
Subtask1 |
sample_01.txt, sample_02.txt, sample_03.txt, subtask1_01.txt, subtask1_02.txt, subtask1_03.txt, subtask1_04.txt, subtask1_05.txt, subtask1_06.txt, subtask1_07.txt, subtask1_08.txt, subtask1_09.txt, subtask1_10.txt, subtask1_11.txt, subtask1_12.txt, subtask1_13.txt, subtask1_14.txt |
Subtask2 |
sample_01.txt, sample_02.txt, sample_03.txt, subtask1_01.txt, subtask1_02.txt, subtask1_03.txt, subtask1_04.txt, subtask1_05.txt, subtask1_06.txt, subtask1_07.txt, subtask1_08.txt, subtask1_09.txt, subtask1_10.txt, subtask1_11.txt, subtask1_12.txt, subtask1_13.txt, subtask1_14.txt, subtask2_01.txt, subtask2_02.txt, subtask2_03.txt, subtask2_04.txt, subtask2_05.txt, subtask2_06.txt, subtask2_07.txt, subtask2_08.txt, subtask2_09.txt, subtask2_10.txt, subtask2_11.txt, subtask2_12.txt, subtask2_13.txt, subtask2_14.txt, subtask2_15.txt, subtask2_16.txt, subtask2_17.txt, subtask2_18.txt, subtask2_19.txt, subtask2_20.txt, subtask2_21.txt, subtask2_22.txt, subtask2_23.txt, subtask2_24.txt, subtask2_25.txt, subtask2_26.txt, subtask2_27.txt, subtask2_28.txt, subtask2_29.txt, subtask2_30.txt, subtask2_31.txt |
Case Name |
Status |
Exec Time |
Memory |
sample_01.txt |
AC |
24 ms |
924 KB |
sample_02.txt |
AC |
25 ms |
796 KB |
sample_03.txt |
AC |
24 ms |
796 KB |
subtask1_01.txt |
AC |
27 ms |
800 KB |
subtask1_02.txt |
AC |
28 ms |
800 KB |
subtask1_03.txt |
AC |
27 ms |
932 KB |
subtask1_04.txt |
AC |
34 ms |
736 KB |
subtask1_05.txt |
AC |
26 ms |
808 KB |
subtask1_06.txt |
AC |
30 ms |
800 KB |
subtask1_07.txt |
AC |
26 ms |
928 KB |
subtask1_08.txt |
AC |
23 ms |
924 KB |
subtask1_09.txt |
AC |
33 ms |
924 KB |
subtask1_10.txt |
AC |
33 ms |
924 KB |
subtask1_11.txt |
AC |
33 ms |
800 KB |
subtask1_12.txt |
AC |
32 ms |
756 KB |
subtask1_13.txt |
AC |
33 ms |
800 KB |
subtask1_14.txt |
AC |
33 ms |
924 KB |
subtask2_01.txt |
AC |
47 ms |
796 KB |
subtask2_02.txt |
AC |
195 ms |
932 KB |
subtask2_03.txt |
AC |
115 ms |
796 KB |
subtask2_04.txt |
AC |
175 ms |
932 KB |
subtask2_05.txt |
AC |
876 ms |
1060 KB |
subtask2_06.txt |
AC |
750 ms |
1052 KB |
subtask2_07.txt |
AC |
102 ms |
804 KB |
subtask2_08.txt |
AC |
223 ms |
924 KB |
subtask2_09.txt |
AC |
717 ms |
928 KB |
subtask2_10.txt |
AC |
2129 ms |
1184 KB |
subtask2_11.txt |
AC |
2127 ms |
1188 KB |
subtask2_12.txt |
AC |
2130 ms |
1180 KB |
subtask2_13.txt |
AC |
2127 ms |
1184 KB |
subtask2_14.txt |
AC |
2126 ms |
1176 KB |
subtask2_15.txt |
AC |
2129 ms |
1184 KB |
subtask2_16.txt |
AC |
2144 ms |
1188 KB |
subtask2_17.txt |
AC |
2132 ms |
1312 KB |
subtask2_18.txt |
AC |
2122 ms |
1172 KB |
subtask2_19.txt |
AC |
2126 ms |
1180 KB |
subtask2_20.txt |
AC |
2133 ms |
1184 KB |
subtask2_21.txt |
AC |
2125 ms |
1180 KB |
subtask2_22.txt |
AC |
2131 ms |
1192 KB |
subtask2_23.txt |
AC |
2128 ms |
1188 KB |
subtask2_24.txt |
AC |
2130 ms |
1184 KB |
subtask2_25.txt |
AC |
2125 ms |
1184 KB |
subtask2_26.txt |
AC |
2123 ms |
1152 KB |
subtask2_27.txt |
AC |
2112 ms |
1188 KB |
subtask2_28.txt |
AC |
2128 ms |
1180 KB |
subtask2_29.txt |
AC |
2133 ms |
1192 KB |
subtask2_30.txt |
AC |
2104 ms |
1120 KB |
subtask2_31.txt |
AC |
2134 ms |
1184 KB |