libm_reduce_pio2.S 14 KB


  1. /*
  2. * Math library
  3. *
  4. * Copyright (C) 2016 Intel Corporation. All rights reserved.
  5. *
  6. * Redistribution and use in source and binary forms, with or without
  7. * modification, are permitted provided that the following conditions
  8. * are met:
  9. *
  10. * * Redistributions of source code must retain the above copyright
  11. * notice, this list of conditions and the following disclaimer.
  12. * * Redistributions in binary form must reproduce the above copyright
  13. * notice, this list of conditions and the following disclaimer in
  14. * the documentation and/or other materials provided with the
  15. * distribution.
  16. * * Neither the name of Intel Corporation nor the names of its
  17. * contributors may be used to endorse or promote products derived
  18. * from this software without specific prior written permission.
  19. *
  20. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  21. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  22. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  23. * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  24. * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  25. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  26. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  27. * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  28. * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  29. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  30. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  31. *
  32. *
  33. * Author Name <jingwei.zhang@intel.com>
  34. * History:
  35. * 03-14-2016 Initial version. numerics svn rev. 12864
  36. */
  37. .file "libm_reduce_pio2.c"  # source name recorded in the object; suggests this listing was generated from C
  # Syntax used throughout: GNU as, AT&T operand order (source first, destination last).
  38. .text
  39. ..TXTST0:
  40. # -- Begin __libm_reduce_pio2d
  #
  # __libm_reduce_pio2d
  #   Reduces a double argument by a multiple of pi/4 and stores the reduced
  #   value as a pair of doubles (leading part, trailing correction).
  #   The constants used decode to 4/pi (_PI04_INV) and to pieces of pi/4
  #   (_PI04_29x4, _PI04_21x5, _PI04_25x2).
  #
  #   In:    xmm0 = x (double)
  #          rdi  = pointer to two doubles that receive the result
  #   Out:   0(out) = leading part, 8(out) = trailing part
  #          eax   = signed multiple count masked to its low 2 bits
  #                  (presumably the C int return value - confirm against callers)
  #   Stack: scratch at -24(%rsp)..-1(%rsp) without moving rsp; no calls are made.
  #   Writes: rax, rcx, rdx, rsi, rdi, r8-r11, xmm0-xmm15, flags.
  #   Register use matches System V AMD64 argument passing - presumably the target ABI.
  #   No explicit Inf/NaN check is visible here; such inputs take the large path.
  #
  41. .text
  42. .align 16,0x90
  43. .globl __libm_reduce_pio2d
  44. __libm_reduce_pio2d:
  45. # parameter 1: %xmm0
  46. # parameter 2: %rdi
  47. ..B1.1:
  48. .cfi_startproc
  49. ..___tag_value___libm_reduce_pio2d.1:
  50. ..L2:
  51. movq %rdi, %r9  # r9 = out pointer (rdi is reused as scratch below)
  52. movsd %xmm0, -8(%rsp)  # spill x so its bytes can be inspected
  53. movzbl -1(%rsp), %eax  # top byte of x: sign bit plus 7 high exponent bits
  54. movl %eax, %r8d
  55. andl $127, %eax  # drop the sign bit
  56. andl $128, %r8d  # keep only the sign bit
  57. movb %al, -1(%rsp)  # the spilled value is now |x|
  58. movzwl -2(%rsp), %eax  # top 16 bits of |x|
  59. movl %eax, %edx
  60. andl $32752, %edx  # 0x7FF0 = exponent field
  61. shrl $4, %edx  # edx = biased exponent of x
  62. shrl $7, %r8d  # r8d = sign of x as 0 or 1
  63. cmpl $1053, %edx  # 1053 = 1023 + 30
  64. jge ..B1.9  # |x| >= 2^30 (or Inf/NaN): large-argument path
  # --- Small path: k = trunc(|x| * 4/pi), computed with integer shifts ---
  65. ..B1.2:
  66. lea _PI04_INV(%rip), %rax
  67. movsd -8(%rsp), %xmm0  # xmm0 = |x|
  68. movsd (%rax), %xmm1
  69. mulsd %xmm0, %xmm1  # xmm1 = |x| * _PI04_INV
  70. movsd %xmm1, -16(%rsp)
  71. movzwl -10(%rsp), %ecx  # top 16 bits of the product
  72. andl $32752, %ecx
  73. shrl $4, %ecx  # ecx = biased exponent of the product
  74. cmpl $1023, %ecx
  75. jl ..B1.4  # product < 1.0, so k = 0
  76. ..B1.3:
  77. movl -12(%rsp), %esi  # high 32 bits of the product
  78. negl %ecx
  79. orl $-1048576, %esi  # 0xFFF00000: sets bit 20, the implicit leading one
  80. addl $30, %ecx  # shift count; low 5 bits equal 31 - unbiased exponent
  81. movl -16(%rsp), %eax  # low 32 bits of the product
  82. shll $11, %esi  # leading one moves to bit 31, sign/exponent bits fall off
  83. shrl $21, %eax  # top 11 bits of the low word
  84. orl %eax, %esi  # esi = top 32 significand bits
  85. shrl %cl, %esi  # esi = k = integer part of the product
  86. jmp ..B1.5
  87. ..B1.4:
  88. xorl %esi, %esi  # k = 0
  89. ..B1.5:
  90. movl %esi, %ecx
  91. lea (%r8,%r8), %eax  # eax = 2 * sign
  92. andl $1, %ecx
  93. negl %eax
  94. addl %esi, %ecx  # ecx = j = k + (k & 1), i.e. k rounded up to even
  95. incl %esi
  96. shrl $1, %esi  # esi = (k + 1) >> 1
  97. incl %eax  # eax = 1 - 2 * sign = +1 or -1
  98. imull %esi, %eax  # apply the sign of x to the count
  99. pxor %xmm7, %xmm7  # clear xmm7 before the convert writes its low half
  100. cvtsi2sd %ecx, %xmm7  # xmm7 = (double) j
  101. andl $3, %eax  # eax = count mod 4; left in eax at ret
  102. cmpl $1046, %edx  # 1046 = 1023 + 23
  103. jge ..B1.7  # 2^23 <= |x| < 2^30: use the 5-piece constant
  # --- |x| < 2^23: |x| - j * (c0 + c1 + c2 + c3) with c[] = _PI04_29x4 ---
  104. ..B1.6:
  105. lea _PI04_29x4(%rip), %rdx
  106. lea 8+_PI04_29x4(%rip), %rcx
  107. lea 16+_PI04_29x4(%rip), %rsi
  108. lea 24+_PI04_29x4(%rip), %rdi
  109. movsd (%rdx), %xmm1
  110. mulsd %xmm7, %xmm1  # c0 * j
  111. movsd (%rcx), %xmm2
  112. subsd %xmm1, %xmm0  # xmm0 = |x| - c0*j
  113. mulsd %xmm7, %xmm2  # c1 * j
  114. movaps %xmm0, %xmm4
  115. movsd (%rsi), %xmm3
  116. subsd %xmm2, %xmm4  # xmm4 = h1 = xmm0 - c1*j
  117. mulsd %xmm7, %xmm3  # c2 * j
  118. subsd %xmm4, %xmm0
  119. movsd (%rdi), %xmm5
  120. subsd %xmm2, %xmm0  # xmm0 = rounding error of h1
  121. mulsd %xmm5, %xmm7  # c3 * j
  122. movaps %xmm4, %xmm2
  123. subsd %xmm3, %xmm2  # xmm2 = h2 = h1 - c2*j (head handed to ..B1.8)
  124. subsd %xmm2, %xmm4
  125. subsd %xmm3, %xmm4  # xmm4 = rounding error of h2
  126. addsd %xmm4, %xmm0
  127. subsd %xmm7, %xmm0  # xmm0 = tail = errors - c3*j
  128. jmp ..B1.8
  # --- 2^23 <= |x| < 2^30: same scheme with the five pieces of _PI04_21x5 ---
  129. ..B1.7:
  130. lea _PI04_21x5(%rip), %rdx
  131. lea 8+_PI04_21x5(%rip), %rcx
  132. lea 16+_PI04_21x5(%rip), %rsi
  133. lea 24+_PI04_21x5(%rip), %rdi
  134. lea 32+_PI04_21x5(%rip), %r10
  135. movsd (%rdx), %xmm1
  136. movsd (%rcx), %xmm2
  137. mulsd %xmm7, %xmm1  # c0 * j
  138. mulsd %xmm7, %xmm2  # c1 * j
  139. subsd %xmm1, %xmm0  # xmm0 = |x| - c0*j
  140. movaps %xmm0, %xmm4
  141. movsd (%rsi), %xmm3
  142. subsd %xmm2, %xmm4  # xmm4 = h1 = xmm0 - c1*j
  143. mulsd %xmm7, %xmm3  # c2 * j
  144. subsd %xmm4, %xmm0
  145. movaps %xmm4, %xmm8
  146. subsd %xmm2, %xmm0  # xmm0 = rounding error of h1
  147. subsd %xmm3, %xmm8  # xmm8 = h2 = h1 - c2*j
  148. movaps %xmm8, %xmm5
  149. subsd %xmm8, %xmm4
  150. movsd (%r10), %xmm6  # c4
  151. subsd %xmm3, %xmm4  # xmm4 = rounding error of h2
  152. addsd %xmm4, %xmm0  # accumulated error
  153. addsd %xmm0, %xmm5  # xmm5 = h3 = h2 + accumulated error
  154. movaps %xmm5, %xmm2
  155. subsd %xmm5, %xmm8
  156. addsd %xmm0, %xmm8  # xmm8 = rounding error of h3
  157. movsd (%rdi), %xmm0
  158. mulsd %xmm7, %xmm0  # c3 * j
  159. mulsd %xmm6, %xmm7  # c4 * j
  160. subsd %xmm0, %xmm2  # xmm2 = h4 = h3 - c3*j (head handed to ..B1.8)
  161. subsd %xmm2, %xmm5
  162. subsd %xmm0, %xmm5  # xmm5 = rounding error of h4
  163. addsd %xmm5, %xmm8
  164. subsd %xmm7, %xmm8
  165. movaps %xmm8, %xmm0  # xmm0 = tail
  # --- Common exit of the small paths: xmm2 = head, xmm0 = tail ---
  166. ..B1.8:
  167. movl %r8d, %r8d  # 32-bit move zero-extends r8 for use as an index
  168. lea _ones(%rip), %rdx
  169. movsd (%rdx,%r8,8), %xmm1  # +1.0 if x was non-negative, -1.0 otherwise
  170. mulsd %xmm1, %xmm2  # restore the sign of x on the head
  171. mulsd %xmm1, %xmm0  # and on the tail
  172. movaps %xmm2, %xmm3
  173. addsd %xmm0, %xmm3  # leading result = head + tail
  174. movsd %xmm3, (%r9)
  175. subsd %xmm3, %xmm2
  176. addsd %xmm0, %xmm2  # trailing result = (head - leading) + tail
  177. movsd %xmm2, 8(%r9)
  178. ret
  # --- Large path (biased exponent >= 1053) ---
  # |x| is scaled by 2^-200, split into a head xh and a tail xl, and multiplied
  # by nine consecutive _DP entries starting at an exponent-dependent index.
  # The partial products are summed with explicit rounding-error compensation.
  # NOTE(review): this looks like a table-driven multiply by 4/pi that drops
  # high bits which cannot affect the result - confirm against the C source.
  179. ..B1.9:
  180. andl $-32753, %eax  # 0xFFFF800F: clear the exponent field in the top 16 bits of |x|
  181. lea -200(%rdx), %r10d  # biased exponent - 200
  182. andl $2047, %r10d
  183. shll $4, %r10d
  184. orl %r10d, %eax
  185. movw %ax, -2(%rsp)  # the spilled value is now |x| * 2^-200
  186. lea -1052(%rdx), %eax  # eax = biased exponent - 1052
  187. imull $83886, %eax, %edi  # multiply-and-shift: 83886 / 2^21 is about 1/25
  188. movsd -8(%rsp), %xmm2  # xmm2 = scaled |x|
  189. movsd %xmm2, -24(%rsp)
  190. sarl $21, %edi  # edi = approximate (exponent - 1052) / 25
  191. movslq %edi, %rdi  # rdi = index i into _DP
  192. imull $-25, %edi, %esi
  193. movl -8(%rsp), %r11d  # low 32 bits of scaled |x|
  194. lea -1052(%rsi,%rdx), %ecx  # ecx = (exponent - 1052) - 25 * i
  195. andl $-134217728, %r11d  # 0xF8000000: clear the low 27 mantissa bits
  196. lea _DP(%rip), %rsi
  197. movl %r11d, -24(%rsp)
  198. negl %ecx
  199. movsd -24(%rsp), %xmm1  # xmm1 = xh, head of scaled |x|
  200. addl $38, %ecx  # ecx = 38 - remainder
  201. movaps %xmm1, %xmm3
  202. movq $-1, %rdx
  203. movsd (%rsi,%rdi,8), %xmm0  # _DP[i]
  204. movaps %xmm1, %xmm7
  205. movsd 8(%rsi,%rdi,8), %xmm5  # _DP[i+1]
  206. movaps %xmm1, %xmm14
  207. mulsd %xmm5, %xmm3  # xh * _DP[i+1]
  208. subsd %xmm1, %xmm2  # xmm2 = xl = scaled |x| - xh
  209. mulsd %xmm2, %xmm0  # xl * _DP[i]   (xh * _DP[i] is never formed)
  210. mulsd %xmm2, %xmm5  # xl * _DP[i+1]
  211. movaps %xmm0, %xmm6
  212. shlq %cl, %rdx  # rdx = all ones with the low (38 - remainder) bits cleared
  213. addsd %xmm3, %xmm6
  214. movaps %xmm6, %xmm4
  215. subsd %xmm6, %xmm0
  216. movsd 16(%rsi,%rdi,8), %xmm12  # _DP[i+2]
  217. addsd %xmm3, %xmm0
  218. mulsd %xmm12, %xmm7  # xh * _DP[i+2]
  219. mulsd %xmm2, %xmm12  # xl * _DP[i+2]
  220. addsd %xmm0, %xmm4
  221. movd %xmm4, %r10  # raw bits of the first partial sum
  222. movsd 24(%rsi,%rdi,8), %xmm3  # _DP[i+3]
  223. mulsd %xmm3, %xmm14  # xh * _DP[i+3]
  224. mulsd %xmm2, %xmm3  # xl * _DP[i+3]
  225. andq %rdx, %r10  # truncate its low bits
  226. movq %r10, -16(%rsp)
  227. subsd -16(%rsp), %xmm6  # remove the truncated leading part from the running sum
  228. movaps %xmm6, %xmm8
  229. addsd %xmm5, %xmm8
  230. movaps %xmm8, %xmm10
  231. subsd %xmm8, %xmm6
  232. addsd %xmm7, %xmm10
  233. addsd %xmm5, %xmm6
  234. subsd %xmm10, %xmm8
  235. addsd %xmm6, %xmm0
  236. addsd %xmm7, %xmm8
  237. movaps %xmm10, %xmm9
  238. movaps %xmm1, %xmm5
  239. movsd 32(%rsi,%rdi,8), %xmm7  # _DP[i+4]
  240. addsd %xmm8, %xmm0
  241. mulsd %xmm7, %xmm5  # xh * _DP[i+4]
  242. mulsd %xmm2, %xmm7  # xl * _DP[i+4]
  243. addsd %xmm0, %xmm9
  244. movaps %xmm0, %xmm13
  245. addsd %xmm12, %xmm9
  246. addsd %xmm14, %xmm9  # xmm9 = estimate used to extract the integer part
  247. movsd %xmm9, -16(%rsp)
  248. movzwl -10(%rsp), %ecx  # top 16 bits of the estimate
  249. shrl $4, %ecx  # exponent field (not masked here)
  250. movd %xmm9, %rax  # raw bits of the estimate
  251. negl %ecx
  252. movaps %xmm1, %xmm9
  253. addl $51, %ecx  # shift count derived from the exponent
  254. sarq %cl, %rax  # shift the fraction bits out
  255. movl %eax, %edx  # edx = low 32 bits; its low bits are used as k below
  256. shlq %cl, %rax  # rebuild the estimate with its fraction bits zeroed
  257. movl %edx, %r11d
  258. movq %rax, -16(%rsp)
  259. lea _zero_none(%rip), %rax
  260. andl $1, %r11d  # r11d = k & 1
  261. incl %edx
  262. shrl $1, %edx  # edx = (k + 1) >> 1
  263. lea _PI04_25x2(%rip), %rcx
  264. subsd -16(%rsp), %xmm10  # subtract the integer part from the running head
  265. movsd (%rax,%r11,8), %xmm11  # 0.0 when k is even, -1.0 when k is odd
  266. lea (%r8,%r8), %eax  # eax = 2 * sign
  267. negl %eax
  268. addsd %xmm10, %xmm13
  269. incl %eax  # eax = 1 - 2 * sign = +1 or -1
  270. subsd %xmm13, %xmm10
  271. imull %edx, %eax  # apply the sign of x to the count
  272. addsd %xmm11, %xmm13  # odd k: subtract 1.0 more (parallels j = k + (k & 1) in ..B1.5)
  273. addsd %xmm10, %xmm0
  274. movaps %xmm13, %xmm15
  275. movaps %xmm1, %xmm11
  276. lea _TWO_26H(%rip), %rdx
  277. andl $3, %eax  # eax = count mod 4; left in eax at ret
  278. addsd %xmm12, %xmm15
  279. movaps %xmm15, %xmm4
  280. subsd %xmm15, %xmm13
  281. addsd %xmm14, %xmm4
  282. addsd %xmm12, %xmm13
  283. subsd %xmm4, %xmm15
  284. addsd %xmm13, %xmm0
  285. addsd %xmm14, %xmm15
  286. movaps %xmm4, %xmm6
  287. movaps %xmm1, %xmm14
  288. movsd 40(%rsi,%rdi,8), %xmm12  # _DP[i+5]
  289. addsd %xmm3, %xmm6
  290. mulsd %xmm12, %xmm9  # xh * _DP[i+5]
  291. addsd %xmm15, %xmm0
  292. mulsd %xmm2, %xmm12  # xl * _DP[i+5]
  293. subsd %xmm6, %xmm4
  294. movaps %xmm6, %xmm8
  295. addsd %xmm3, %xmm4
  296. addsd %xmm5, %xmm8
  297. addsd %xmm4, %xmm0
  298. subsd %xmm8, %xmm6
  299. movaps %xmm8, %xmm10
  300. addsd %xmm5, %xmm6
  301. addsd %xmm7, %xmm10
  302. addsd %xmm6, %xmm0
  303. subsd %xmm10, %xmm8
  304. movsd 48(%rsi,%rdi,8), %xmm3  # _DP[i+6]
  305. movaps %xmm10, %xmm13
  306. mulsd %xmm3, %xmm11  # xh * _DP[i+6]
  307. addsd %xmm9, %xmm13
  308. addsd %xmm7, %xmm8
  309. mulsd %xmm2, %xmm3  # xl * _DP[i+6]
  310. addsd %xmm11, %xmm12
  311. subsd %xmm13, %xmm10
  312. addsd %xmm8, %xmm0
  313. addsd %xmm9, %xmm10
  314. movsd 56(%rsi,%rdi,8), %xmm15  # _DP[i+7]
  315. movaps %xmm13, %xmm4
  316. mulsd %xmm15, %xmm14  # xh * _DP[i+7]
  317. addsd %xmm12, %xmm4
  318. addsd %xmm10, %xmm0
  319. mulsd %xmm15, %xmm2  # xl * _DP[i+7]
  320. addsd %xmm14, %xmm3
  321. subsd %xmm4, %xmm13
  322. movaps %xmm4, %xmm5
  323. addsd %xmm12, %xmm13
  324. addsd %xmm3, %xmm5
  325. addsd %xmm13, %xmm0
  326. subsd %xmm5, %xmm4
  327. addsd %xmm3, %xmm4
  328. movsd 64(%rsi,%rdi,8), %xmm3  # _DP[i+8]
  329. mulsd %xmm3, %xmm1  # xh * _DP[i+8]
  330. addsd %xmm4, %xmm0
  331. addsd %xmm1, %xmm2
  332. movaps %xmm2, %xmm3
  333. lea 8+_PI04_25x2(%rip), %rsi
  334. movsd (%rdx), %xmm1  # _TWO_26H
  335. lea _ones(%rip), %rdx
  336. addsd %xmm5, %xmm3  # xmm3 = sum to be split
  337. mulsd %xmm3, %xmm1
  338. addsd %xmm1, %xmm3
  339. subsd %xmm1, %xmm3  # add then subtract the scaled copy: keeps only the leading bits in xmm3
  340. subsd %xmm3, %xmm5
  341. addsd %xmm2, %xmm5  # remainder of the split
  342. movsd (%rcx), %xmm2  # _PI04_25x2[0], head of pi/4
  343. addsd %xmm5, %xmm0  # xmm0 = total low part
  344. movaps %xmm0, %xmm1
  345. addsd %xmm3, %xmm0  # xmm0 = high + low
  346. mulsd %xmm2, %xmm1  # low * head
  347. mulsd %xmm2, %xmm3  # high * head
  348. mulsd (%rsi), %xmm0  # (high + low) * _PI04_25x2[1]
  349. addsd %xmm0, %xmm1  # xmm1 = sum of the two small products
  350. movaps %xmm1, %xmm0
  351. addsd %xmm3, %xmm0  # leading result
  352. subsd %xmm0, %xmm3
  353. addsd %xmm1, %xmm3  # trailing result
  354. movsd (%rdx,%r8,8), %xmm1  # +1.0 if x was non-negative, -1.0 otherwise
  355. mulsd %xmm1, %xmm0
  356. mulsd %xmm1, %xmm3
  357. movsd %xmm0, (%r9)
  358. movsd %xmm3, 8(%r9)
  359. ret
  360. .align 16,0x90
  361. .cfi_endproc
  362. .type __libm_reduce_pio2d,@function
  363. .size __libm_reduce_pio2d,.-__libm_reduce_pio2d
  364. .data
  365. # -- End __libm_reduce_pio2d
  366. .section .rodata, "a"
  367. .align 16
  368. .align 16
  # _PI04_INV: one double, stored as low word then high word.
  # Bit pattern 0x3FF45F30_6DC9C883, about 1.2732395 (4/pi).
  # __libm_reduce_pio2d multiplies |x| by it on the small-argument path.
  369. _PI04_INV:
  370. .long 1841940611
  371. .long 1072979760
  372. .type _PI04_INV,@object
  373. .size _PI04_INV,8
  374. .space 8, 0x00 # pad
  375. .align 16
  # _PI04_29x4: four doubles (low word, high word each), used by
  # __libm_reduce_pio2d when the biased exponent of x is below 1046.
  # Entry 0 is 0x3FE921FB_54000000, the leading bits of pi/4; entry 2 has its
  # sign bit set. Presumably the four entries sum to pi/4 with extra precision
  # (the name suggests 29-bit pieces - not verified here).
  376. _PI04_29x4:
  377. .long 1409286144
  378. .long 1072243195
  379. .long 301989888
  380. .long 1040255814
  381. .long 1006632960
  382. .long 3156637299
  383. .long 2207917344
  384. .long 979464219
  385. .type _PI04_29x4,@object
  386. .size _PI04_29x4,32
  387. .align 16
  # _PI04_21x5: five doubles (low word, high word each), used by
  # __libm_reduce_pio2d when the biased exponent of x is in 1046..1052.
  # Entry 0 is 0x3FE921FA_00000000; the first four entries have an all-zero low
  # word and entries 2 and 3 have their sign bit set. Presumably the five
  # entries sum to pi/4 (the name suggests 21-bit pieces - not verified here).
  388. _PI04_21x5:
  389. .long 0
  390. .long 1072243194
  391. .long 0
  392. .long 1051018307
  393. .long 0
  394. .long 3174514122
  395. .long 0
  396. .long 3153310618
  397. .long 3773204808
  398. .long 981752838
  399. .type _PI04_21x5,@object
  400. .size _PI04_21x5,40
  401. .space 8, 0x00 # pad
  402. .align 16
  # _ones: two doubles, +1.0 (0x3FF00000_00000000) then -1.0 (0xBFF00000_00000000).
  # Indexed by the sign bit of x (0 or 1) to put the sign back on the result.
  403. _ones:
  404. .long 0
  405. .long 1072693248
  406. .long 0
  407. .long 3220176896
  408. .type _ones,@object
  409. .size _ones,16
  410. .align 16
  # _DP: 50 doubles (400 bytes), each stored as low word then high word.
  # The large-argument path of __libm_reduce_pio2d reads nine consecutive
  # entries _DP[i] .. _DP[i+8], with i derived from the exponent of x.
  # Entry 0 is 0.0. Entry 1 is 0x4C745F30_60000000: biased exponent 1223
  # (1023 + 200) with the same leading mantissa bits as _PI04_INV.
  # The high words decrease from entry 1 onward.
  # NOTE(review): presumably successive chunks of 4/pi scaled by 2^200 - confirm.
  411. _DP:
  412. .long 0
  413. .long 0
  414. .long 1610612736
  415. .long 1282694960
  416. .long 0
  417. .long 1256952721
  418. .long 536870912
  419. .long 1229269500
  420. .long 3221225472
  421. .long 1202544455
  422. .long 0
  423. .long 1176818551
  424. .long 2147483648
  425. .long 1148939346
  426. .long 536870912
  427. .long 1124701124
  428. .long 3758096384
  429. .long 1099498527
  430. .long 3758096384
  431. .long 1071929578
  432. .long 1342177280
  433. .long 1046982385
  434. .long 805306368
  435. .long 1020320658
  436. .long 2147483648
  437. .long 993817732
  438. .long 0
  439. .long 968598976
  440. .long 2684354560
  441. .long 942220475
  442. .long 2415919104
  443. .long 915426956
  444. .long 0
  445. .long 885849629
  446. .long 536870912
  447. .long 863855510
  448. .long 1610612736
  449. .long 836031391
  450. .long 4026531840
  451. .long 810828058
  452. .long 1073741824
  453. .long 784674491
  454. .long 1610612736
  455. .long 757207974
  456. .long 3489660928
  457. .long 732020890
  458. .long 0
  459. .long 703061904
  460. .long 1610612736
  461. .long 679713053
  462. .long 2147483648
  463. .long 652001705
  464. .long 1073741824
  465. .long 626850382
  466. .long 2147483648
  467. .long 597786158
  468. .long 805306368
  469. .long 575535400
  470. .long 536870912
  471. .long 548814833
  472. .long 268435456
  473. .long 523239288
  474. .long 3758096384
  475. .long 495550718
  476. .long 2952790016
  477. .long 469954840
  478. .long 1073741824
  479. .long 442925723
  480. .long 1073741824
  481. .long 416247094
  482. .long 3758096384
  483. .long 392128403
  484. .long 2147483648
  485. .long 364254062
  486. .long 3221225472
  487. .long 339643518
  488. .long 2684354560
  489. .long 313162111
  490. .long 805306368
  491. .long 286354345
  492. .long 2952790016
  493. .long 260811902
  494. .long 1610612736
  495. .long 234667567
  496. .long 3758096384
  497. .long 207520668
  498. .long 1073741824
  499. .long 182175017
  500. .long 4026531840
  501. .long 155380331
  502. .long 805306368
  503. .long 129417058
  504. .long 536870912
  505. .long 103691636
  506. .long 0
  507. .long 73760972
  508. .long 3221225472
  509. .long 48348958
  510. .long 536870912
  511. .long 23784188
  512. .type _DP,@object
  513. .size _DP,400
  514. .align 16
  # _zero_none: two doubles, 0.0 then -1.0 (0xBFF00000_00000000).
  # Indexed by the low bit of the integer count on the large-argument path,
  # so -1.0 is added only when that count is odd.
  515. _zero_none:
  516. .long 0
  517. .long 0
  518. .long 0
  519. .long 3220176896
  520. .type _zero_none,@object
  521. .size _zero_none,16
  522. .align 16
  # _PI04_25x2: two doubles (low word, high word each).
  # Entry 0 is 0x3FE921FB_40000000, the leading bits of pi/4; entry 1 is a much
  # smaller positive value. Presumably a head/tail pair for pi/4.
  # The large-argument path multiplies the reduced fraction by this pair.
  523. _PI04_25x2:
  524. .long 1073741824
  525. .long 1072243195
  526. .long 407279769
  527. .long 1046758445
  528. .type _PI04_25x2,@object
  529. .size _PI04_25x2,16
  530. .align 16
  # _TWO_26H: one double, 0x41980000_00000000 = 1.5 * 2^26 (100663296.0).
  # Used on the large-argument path in a multiply, add, subtract sequence that
  # keeps only the leading bits of a sum.
  531. _TWO_26H:
  532. .long 0
  533. .long 1100480512
  534. .type _TWO_26H,@object
  535. .size _TWO_26H,8
  536. .data
  # Empty .note.GNU-stack section: GNU toolchain convention for marking an
  # object as not needing an executable stack.
  537. .section .note.GNU-stack, ""
  538. // -- Begin DWARF2 SEGMENT .eh_frame
  # Only the section header, a label and an alignment directive appear here;
  # no unwind records are written by hand in this listing.
  539. .section .eh_frame,"a",@progbits
  540. .eh_frame_seg:
  541. .align 1
  542. # End