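This seems to me like a bug in gcc. From the following analysis
(start reading at 0x400a38), the 8 bytes that the movhpd at 0x400a38
loads from memory into the high half of xmm12 are never used: the only
later read of xmm12 (unpcklpd at 0x400a4d) uses just its low half, and
xmm12 is then completely overwritten by subsequent instructions,
either in the post-loop block, or in the first instruction of the
next iteration.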
==12860== Invalid read of size 8
==12860== at 0x400A38: integrate_gf_npbc_
This seems to me like a bug in gcc. From the following analysis
(start reading at 0x400a38), the value loaded from memory is never
used -- xmm12 is completely overwritten by subsequent instructions,
either in the post-loop block, or in the first instruction of the
next iteration.
==12860== Invalid read of size 8
==12860== at 0x400A38: integrate_gf_npbc_
# def xmm12 (low half loaded, high half zeroed)
4009d8: f2 44 0f 10 24 16 movsd (%rsi,%rdx,1),%xmm12
4009de: 41 83 c6 01 add $0x1,%r14d
4009e2: f2 0f 10 31 movsd (%rcx),%xmm6
4009e6: 66 44 0f 16 64 16 08 movhpd 0x8(%rsi,%rdx,1),%xmm12
4009ed: f2 41 0f 10 04 17 movsd (%r15,%rdx,1),%xmm0
4009f3: 66 0f 16 71 08 movhpd 0x8(%rcx),%xmm6
4009f8: 66 41 0f 28 dc movapd %xmm12,%xmm3
4009fd: f2 44 0f 10 61 10 movsd 0x10(%rcx),%xmm12
400a03: 66 0f 28 ce movapd %xmm6,%xmm1
400a07: 66 41 0f 16 44 17 08 movhpd 0x8(%r15,%rdx,1),%xmm0
400a0e: 66 44 0f 16 61 18 movhpd 0x18(%rcx),%xmm12
400a14: f2 0f 10 33 movsd (%rbx),%xmm6
400a18: 66 0f 28 d0 movapd %xmm0,%xmm2
400a1c: 48 83 c2 10 add $0x10,%rdx
400a20: 66 41 0f 14 cc unpcklpd %xmm12,%xmm1
400a25: 66 0f 16 73 08 movhpd 0x8(%rbx),%xmm6
400a2a: f2 44 0f 10 63 10 movsd 0x10(%rbx),%xmm12
400a30: 48 83 c1 20 add $0x20,%rcx
400a34: 66 0f 28 c6 movapd %xmm6,%xmm0
# load high half xmm12 (error reported here). low half unchanged.
400a38: 66 44 0f 16 63 18 movhpd 0x18(%rbx),%xmm12
400a3e: 66 0f 28 f1 movapd %xmm1,%xmm6
400a42: 66 0f 59 ca mulpd %xmm2,%xmm1
400a46: 48 83 c3 20 add $0x20,%rbx
400a4a: 41 39 ee cmp %ebp,%r14d
# reads low half xmm12 only
400a4d: 66 41 0f 14 c4 unpcklpd %xmm12,%xmm0
400a52: 66 0f 59 f3 mulpd %xmm3,%xmm6
400a56: 66 0f 59 d8 mulpd %xmm0,%xmm3
400a5a: 66 0f 58 f9 addpd %xmm1,%xmm7
400a5e: 66 0f 59 c2 mulpd %xmm2,%xmm0
400a62: 66 44 0f 58 de addpd %xmm6,%xmm11
400a67: 66 0f 58 eb addpd %xmm3,%xmm5
400a6b: 66 0f 58 e0 addpd %xmm0,%xmm4
400a6f: 0f 82 63 ff ff ff jb 4009d8 # (loop head)
400a75: 66 0f 28 c4 movapd %xmm4,%xmm0
400a79: 8b 54 24 a8 mov -0x58(%rsp),%edx
# def xmm12 (overwrite both halves)
400a7d: 66 44 0f 28 e7 movapd %xmm7,%xmm12