/* tanh_gen.S */

/*
* Math library
*
* Copyright (C) 2016 Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*
* Author Name <jingwei.zhang@intel.com>
* History:
* 03-14-2016 Initial version. numerics svn rev. 12864
*/
  .file "tanh_gen.c"
  .text
..TXTST0:
# -- Begin tanh
  .text
/*
 * double tanh(double x)
 *
 * Syntax:  GNU as, AT&T operand order (source, destination).
 * ABI:     System V AMD64.
 * In:      %xmm0 = x
 * Out:     %xmm0 = tanh(x)
 * Clobber: %rax, %rcx, %rdx, %rsi, %rdi, %r8-%r11, %xmm0-%xmm15, flags
 *          (all caller-saved under this ABI).
 * Stack:   leaf routine, %rsp is never adjusted.  The scratch slots at
 *          -8, -16, -24, -32 and -40(%rsp) sit in the red zone, so this
 *          code must not be used where a red zone is unavailable.
 *
 * The routine dispatches on the high 32 bits of |x| (sign bit cleared):
 *
 *   hi >= 0x7ff00000            ..B1.16  Inf -> +-1, NaN -> x unchanged
 *   hi >= 0x40330fc1 (~19.06)   ..B1.15  +-1 minus a tiny value
 *   hi >= 0x40146667 (~5.1)     ..B1.14  1 - 2/(E + 1), E built from a
 *                                        power of two times a rational term
 *   hi >= 0x3f947ae1 (~0.02)    ..B1.13  table driven (__libm_exp_table_128)
 *   hi >= 0x3c600000 (2^-57)    ..B1.12  odd polynomial in x
 *   hi >= 0x00100000 (normal)   ..B1.7   x minus an underflowing product
 *   otherwise (subnormal or 0)  ..B1.8   zero returned as is, else as B1.7
 *
 * Several stores into the red zone are never read back in this routine;
 * they are kept exactly as generated (presumably compiler artifacts used
 * to force evaluation -- TODO confirm before removing any of them).
 */
        .align 16,0x90
        .globl tanh
tanh:
# parameter 1: %xmm0
..B1.1:
        .cfi_startproc
..___tag_value_tanh.1:
..L2:
        movsd   %xmm0, -8(%rsp)                 # spill x so its two words can be read as integers
        movl    -4(%rsp), %eax                  # eax = high word of x (sign, exponent, top mantissa)
        movl    %eax, %ecx
        andl    $0x7fffffff, %ecx               # ecx = high word of |x|
        cmpl    $0x7ff00000, %ecx               # exponent all ones: Inf or NaN
        jae     ..B1.16
..B1.2:
        cmpl    $0x40330fc1, %ecx               # |x| >= ~19.06: result saturates
        jae     ..B1.15
..B1.3:
        cmpl    $0x40146667, %ecx               # |x| >= ~5.1
        jae     ..B1.14
..B1.4:
        cmpl    $0x3f947ae1, %ecx               # |x| >= ~0.02
        jae     ..B1.13
..B1.5:
        cmpl    $0x3c600000, %ecx               # |x| >= 2^-57
        jae     ..B1.12
..B1.6:
        shrl    $31, %eax                       # rax = sign bit of x (0 or 1), zero-extended
        cmpl    $0x00100000, %ecx               # below the smallest normal exponent?
        jb      ..B1.8

/* Normal x with |x| < 2^-57: return x - (+-tiny * tiny). */
..B1.7:
        lea     _small_value_64(%rip), %rdx
        movsd   (%rdx,%rax,8), %xmm1            # xmm1 = tiny carrying the sign of x
        mulsd   (%rdx), %xmm1                   # tiny * tiny, far below the double range
        subsd   %xmm1, %xmm0
        ret

/* Subnormal or zero x. */
..B1.8:
        movl    -8(%rsp), %edx                  # low word of x
        orl     %edx, %ecx                      # all magnitude bits clear means x is +-0
        je      ..B1.10
..B1.9:
        lea     _small_value_64(%rip), %rdx
        movsd   -8(%rsp), %xmm0
        movsd   (%rdx,%rax,8), %xmm1
        mulsd   (%rdx), %xmm1
        movsd   %xmm1, -32(%rsp)                # stored but not read back here
        subsd   %xmm1, %xmm0
        ret
..B1.10:
        movsd   -8(%rsp), %xmm0                 # +-0 is returned unchanged
..B1.11:
        ret

/*
 * 2^-57 <= |x| < ~0.02:
 *   result = x + x * (p23*x^2 + p21*x^4 + p22*x^6 + p20*x^8)
 * where pNN is .L_2il0floatpacket.NN.
 */
..B1.12:
        movsd   -8(%rsp), %xmm4                 # xmm4 = x
        movaps  %xmm4, %xmm2
        mulsd   %xmm4, %xmm2                    # xmm2 = x^2
        movaps  %xmm2, %xmm1
        mulsd   %xmm2, %xmm1                    # xmm1 = x^4
        movsd   .L_2il0floatpacket.20(%rip), %xmm0
        movsd   .L_2il0floatpacket.22(%rip), %xmm3
        mulsd   %xmm1, %xmm0
        mulsd   %xmm1, %xmm3
        addsd   .L_2il0floatpacket.21(%rip), %xmm0
        addsd   .L_2il0floatpacket.23(%rip), %xmm3
        mulsd   %xmm1, %xmm0                    # xmm0 = p21*x^4 + p20*x^8
        mulsd   %xmm2, %xmm3                    # xmm3 = p23*x^2 + p22*x^6
        addsd   %xmm3, %xmm0
        mulsd   %xmm4, %xmm0
        addsd   %xmm4, %xmm0
        ret

/*
 * ~0.02 <= |x| < ~5.1.
 * y = 2*|x| is scaled by packet.11, rounded to an integer N with the
 * add/subtract 2^52 idiom, and N times the two-part constant
 * (packet.12 + packet.13) is subtracted from y to leave a small remainder.
 * The low 7 bits of N (sign-extended) select a 16-byte entry pair in
 * __libm_exp_table_128 (base offset 1024); the remaining bits of N form
 * the exponent field of a power of two.  The pieces are combined with a
 * divide and a correction step, then multiplied by ones[sign] (+-1.0).
 * NOTE(review): the exact error-compensation scheme is not documented
 * here; the instruction order below must be preserved.
 */
..B1.13:
        movsd   -8(%rsp), %xmm0
        lea     _TWO_52(%rip), %rdx
        andps   .L_2il0floatpacket.24(%rip), %xmm0      # clear the sign bit: |x|
        xorl    %r10d, %r10d                            # r10d = 0, used to zero low words below
        mulsd   .L_2il0floatpacket.10(%rip), %xmm0      # y = 2*|x|
        movq    $0x3ff0000000000000, %rsi               # bit pattern of 1.0
        movsd   .L_2il0floatpacket.11(%rip), %xmm4
        lea     _TWO_32(%rip), %r9
        mulsd   %xmm0, %xmm4
        lea     ones(%rip), %r11
        movsd   .L_2il0floatpacket.12(%rip), %xmm5
        movsd   .L_2il0floatpacket.13(%rip), %xmm6
        movsd   .L_2il0floatpacket.25(%rip), %xmm3      # 1.0
        movaps  %xmm3, %xmm8
        addsd   (%rdx), %xmm4                           # + 2^52: integer N lands in the low word
        movsd   %xmm4, -32(%rsp)
        movsd   -32(%rsp), %xmm7
        movsd   .L_2il0floatpacket.15(%rip), %xmm12
        movsd   .L_2il0floatpacket.17(%rip), %xmm14
        movsd   .L_2il0floatpacket.14(%rip), %xmm2      # -1.0
        movaps  %xmm2, %xmm9
        subsd   (%rdx), %xmm7                           # xmm7 = N as a double
        mulsd   %xmm7, %xmm5
        mulsd   %xmm6, %xmm7
        subsd   %xmm5, %xmm0
        movaps  %xmm0, %xmm10
        movaps  %xmm3, %xmm6
        movl    -32(%rsp), %ecx                         # ecx = N
        movl    %ecx, %edi
        shll    $25, %edi                               # with the sarl below: sign-extend low 7 bits of N
        subsd   %xmm7, %xmm10
        movaps  %xmm10, %xmm11
        subsd   %xmm10, %xmm0
        addsd   %xmm10, %xmm8
        subsd   %xmm7, %xmm0
        movsd   %xmm8, -40(%rsp)
        addsd   %xmm0, %xmm11
        movaps  %xmm11, %xmm13
        mulsd   %xmm11, %xmm13
        mulsd   %xmm13, %xmm12
        mulsd   %xmm13, %xmm14
        addsd   .L_2il0floatpacket.16(%rip), %xmm12
        addsd   .L_2il0floatpacket.18(%rip), %xmm14
        mulsd   %xmm13, %xmm12
        mulsd   %xmm13, %xmm14
        mulsd   %xmm11, %xmm12
        addsd   .L_2il0floatpacket.19(%rip), %xmm14
        movl    %r10d, -40(%rsp)                        # zero the low 32 bits of the stored value
        movsd   -40(%rsp), %xmm4                        # xmm4 = that value truncated to its high word
        mulsd   %xmm13, %xmm14
        addsd   %xmm4, %xmm9
        sarl    $25, %edi                               # edi = table index j in [-64, 63]
        subsd   %xmm9, %xmm10
        subl    %edi, %ecx                              # ecx = N - j
        addsd   %xmm10, %xmm0
        movslq  %edi, %rdi
        addsd   %xmm12, %xmm0
        shlq    $4, %rdi                                # 16 bytes per table entry pair
        addsd   %xmm14, %xmm0
        movq    __libm_exp_table_128@GOTPCREL(%rip), %r8
        shrl    $7, %ecx                                # ecx = (N - j) / 128
        movsd   (%r9), %xmm5                            # 2^32
        movsd   1032(%r8,%rdi), %xmm15
        movaps  %xmm15, %xmm1
        mulsd   %xmm0, %xmm1
        addsd   %xmm4, %xmm0
        mulsd   %xmm15, %xmm4
        mulsd   1024(%r8,%rdi), %xmm0
        shlq    $52, %rcx                               # move the integer into the exponent field
        addsd   %xmm0, %xmm1
        addq    %rsi, %rcx                              # add the bias: bit pattern of a power of two
        movaps  %xmm2, %xmm0
        movq    %rcx, -16(%rsp)
        movsd   -16(%rsp), %xmm7                        # xmm7 = that power of two
        mulsd   %xmm7, %xmm1
        mulsd   %xmm4, %xmm7
        movaps  %xmm7, %xmm10
        addsd   %xmm7, %xmm0
        addsd   %xmm7, %xmm6
        addsd   (%r9), %xmm0
        addsd   %xmm5, %xmm6
        movsd   %xmm0, -32(%rsp)
        movsd   -32(%rsp), %xmm0
        movsd   %xmm6, -32(%rsp)
        subsd   %xmm5, %xmm0
        movsd   -32(%rsp), %xmm9
        subsd   %xmm0, %xmm10
        subsd   (%r9), %xmm9
        addsd   %xmm2, %xmm10
        movsd   %xmm7, -40(%rsp)                        # stored but not read back here
        movaps  %xmm9, %xmm8
        shrl    $31, %eax                               # rax = sign bit of x
        subsd   %xmm9, %xmm7
        addsd   %xmm1, %xmm10
        addsd   %xmm3, %xmm7
        movsd   (%r11,%rax,8), %xmm12                   # xmm12 = ones[sign] = +-1.0
        addsd   %xmm7, %xmm1
        addsd   %xmm1, %xmm8
        divsd   %xmm8, %xmm3                            # xmm3 = 1.0 / xmm8
        movsd   %xmm3, -24(%rsp)
        movl    %r10d, -24(%rsp)                        # zero the low 32 bits of the quotient
        movsd   -24(%rsp), %xmm11                       # xmm11 = quotient truncated to its high word
        mulsd   %xmm11, %xmm1
        subsd   %xmm11, %xmm3
        mulsd   %xmm11, %xmm9
        mulsd   %xmm3, %xmm8
        addsd   %xmm2, %xmm9
        addsd   %xmm8, %xmm1
        movaps  %xmm11, %xmm2
        addsd   %xmm1, %xmm9
        addsd   %xmm3, %xmm2
        mulsd   %xmm9, %xmm2
        movaps  %xmm0, %xmm1
        mulsd   %xmm11, %xmm0
        subsd   %xmm2, %xmm3
        mulsd   %xmm3, %xmm1
        addsd   %xmm11, %xmm3
        mulsd   %xmm3, %xmm10
        movsd   %xmm0, -8(%rsp)                         # stored but not read back here
        addsd   %xmm10, %xmm1
        mulsd   %xmm12, %xmm0                           # apply the sign to the leading term
        mulsd   %xmm1, %xmm12                           # apply the sign to the correction term
        addsd   %xmm12, %xmm0
        ret

/*
 * ~5.1 <= |x| < ~19.06.
 * |x| is split into a high part (via the add/subtract 2^32 idiom) and a
 * low part, multiplied by the two-part constant packet.0 + packet.1, and
 * rounded to an integer n with the 2^52 idiom.  With t the remainder:
 *   P = t*(((p2*t^2 + p3)*t^2 + p4)*t^2 + p5)
 *   Q =   ((p6*t^2 + p7)*t^2 + p8)*t^2 + p9
 *   E = (1 + 2*P/(Q - P)) * 2^n
 *   result = ones[sign] + twos[sign ^ 1] / (E + 1)
 * i.e. +-1 with the opposite-signed 2/(E + 1) added.
 */
..B1.14:
        lea     _TWO_32(%rip), %rdx
        lea     _TWO_52(%rip), %rcx
        movsd   -8(%rsp), %xmm1
        movq    $0x3ff0000000000000, %rsi               # bit pattern of 1.0
        andps   .L_2il0floatpacket.24(%rip), %xmm1      # xmm1 = |x|
        lea     twos(%rip), %r8
        movsd   .L_2il0floatpacket.0(%rip), %xmm3
        movaps  %xmm1, %xmm7
        movsd   (%rdx), %xmm0
        lea     ones(%rip), %r9
        movsd   (%rcx), %xmm4
        addsd   %xmm1, %xmm0                            # |x| + 2^32 drops the low bits of |x|
        movsd   %xmm0, -32(%rsp)
        movsd   -32(%rsp), %xmm6
        movsd   .L_2il0floatpacket.1(%rip), %xmm2
        mulsd   %xmm1, %xmm2
        subsd   (%rdx), %xmm6                           # xmm6 = high part of |x|
        movsd   .L_2il0floatpacket.2(%rip), %xmm12
        subsd   %xmm6, %xmm7                            # xmm7 = low part of |x|
        mulsd   %xmm3, %xmm6
        mulsd   %xmm3, %xmm7
        addsd   %xmm6, %xmm4                            # + 2^52: integer n lands in the low word
        addsd   %xmm2, %xmm7
        movsd   %xmm4, -32(%rsp)
        movsd   -32(%rsp), %xmm5
        movsd   .L_2il0floatpacket.6(%rip), %xmm9
        movsd   .L_2il0floatpacket.25(%rip), %xmm11     # 1.0
        movslq  -32(%rsp), %rdi                         # rdi = n
        shlq    $52, %rdi
        subsd   (%rcx), %xmm5                           # xmm5 = n as a double
        addq    %rsi, %rdi                              # bit pattern of 2^n
        subsd   %xmm5, %xmm6
        movq    %rdi, -16(%rsp)
        addsd   %xmm6, %xmm7                            # xmm7 = t
        movaps  %xmm7, %xmm8
        mulsd   %xmm7, %xmm8                            # xmm8 = t^2
        mulsd   %xmm8, %xmm12
        mulsd   %xmm8, %xmm9
        addsd   .L_2il0floatpacket.3(%rip), %xmm12
        addsd   .L_2il0floatpacket.7(%rip), %xmm9
        mulsd   %xmm8, %xmm12
        mulsd   %xmm8, %xmm9
        addsd   .L_2il0floatpacket.4(%rip), %xmm12
        addsd   .L_2il0floatpacket.8(%rip), %xmm9
        mulsd   %xmm8, %xmm12
        mulsd   %xmm8, %xmm9
        addsd   .L_2il0floatpacket.5(%rip), %xmm12
        addsd   .L_2il0floatpacket.9(%rip), %xmm9       # xmm9 = Q
        mulsd   %xmm7, %xmm12                           # xmm12 = P
        movsd   -16(%rsp), %xmm10                       # xmm10 = 2^n
        subsd   %xmm12, %xmm9
        divsd   %xmm9, %xmm12                           # P / (Q - P)
        mulsd   .L_2il0floatpacket.10(%rip), %xmm12     # times 2.0
        shrl    $31, %eax                               # rax = sign bit of x
        addsd   %xmm11, %xmm12
        movl    %eax, %r10d                             # r10 = sign, index into ones
        movsd   %xmm12, -24(%rsp)                       # stored but not read back here
        xorl    $1, %eax                                # rax = sign ^ 1, index into twos
        mulsd   %xmm10, %xmm12                          # xmm12 = E
        movsd   (%r8,%rax,8), %xmm0
        addsd   %xmm11, %xmm12                          # E + 1
        divsd   %xmm12, %xmm0
        movsd   %xmm7, -8(%rsp)                         # stored but not read back here
        addsd   (%r9,%r10,8), %xmm0
        ret

/* ~19.06 <= |x| < Inf: return ones[sign] - _small_value_64[sign]. */
..B1.15:
        lea     ones(%rip), %rdx
        lea     _small_value_64(%rip), %rcx
        shrl    $31, %eax
        movsd   (%rdx,%rax,8), %xmm0
        subsd   (%rcx,%rax,8), %xmm0
        ret

/* Exponent field all ones: separate Inf from NaN. */
..B1.16:
        andl    $0x800fffff, %ecx               # keep the mantissa bits of the high word
        orl     -8(%rsp), %ecx                  # any mantissa bit set means NaN
        jne     ..B1.18
..B1.17:
        lea     ones(%rip), %rdx
        shrl    $31, %eax
        movsd   (%rdx,%rax,8), %xmm0            # tanh(+-Inf) = +-1.0
        ret
..B1.18:
        movsd   -8(%rsp), %xmm0                 # NaN: the input is returned unchanged
..B1.19:
        ret
        .align 16,0x90
        .cfi_endproc
        .type tanh,@function
        .size tanh,.-tanh
  .data
# -- End tanh
  .section .rodata, "a"
/*
 * Read-only constants for tanh.  Each double is written as two 32-bit
 * words, low word first (little-endian), so ".long lo,hi" is the IEEE-754
 * bit pattern 0xhi_lo.  Decimal values in the comments are approximate.
 */

/*
 * Absolute-value mask for the low double lane.  This is used as a memory
 * operand of andps in tanh, which requires 16-byte alignment: keep the
 * .align 16.
 */
        .align 16
.L_2il0floatpacket.24:
        .long 0xffffffff,0x7fffffff,0x00000000,0x00000000
        .type .L_2il0floatpacket.24,@object
        .size .L_2il0floatpacket.24,16

/* Two-part multiplier of the ..B1.14 path: ~2.8854 (about 2/ln 2) plus a small low-order term. */
        .align 8
.L_2il0floatpacket.0:
        .long 0x60000000,0x40071547
        .type .L_2il0floatpacket.0,@object
        .size .L_2il0floatpacket.0,8
        .align 8
.L_2il0floatpacket.1:
        .long 0xf85ddf44,0x3e64ae0b
        .type .L_2il0floatpacket.1,@object
        .size .L_2il0floatpacket.1,8

/* Packets 2-5: coefficients of the odd polynomial P in the ..B1.14 path. */
        .align 8
.L_2il0floatpacket.2:
        .long 0xfc0798c2,0x3fb3adff
        .type .L_2il0floatpacket.2,@object
        .size .L_2il0floatpacket.2,8
        .align 8
.L_2il0floatpacket.3:
        .long 0x0e243699,0x406e3d92
        .type .L_2il0floatpacket.3,@object
        .size .L_2il0floatpacket.3,8
        .align 8
.L_2il0floatpacket.4:
        .long 0xefbe8c98,0x40f689a6
        .type .L_2il0floatpacket.4,@object
        .size .L_2il0floatpacket.4,8
        .align 8
.L_2il0floatpacket.5:
        .long 0x1ba09c9e,0x4156de47
        .type .L_2il0floatpacket.5,@object
        .size .L_2il0floatpacket.5,8

/* Packets 6-9: coefficients of the even polynomial Q in the ..B1.14 path. */
        .align 8
.L_2il0floatpacket.6:
        .long 0xa57ad784,0x4018d7c2
        .type .L_2il0floatpacket.6,@object
        .size .L_2il0floatpacket.6,8
        .align 8
.L_2il0floatpacket.7:
        .long 0x63529aa4,0x40b6b90b
        .type .L_2il0floatpacket.7,@object
        .size .L_2il0floatpacket.7,8
        .align 8
.L_2il0floatpacket.8:
        .long 0xafc4d247,0x412d4376
        .type .L_2il0floatpacket.8,@object
        .size .L_2il0floatpacket.8,8
        .align 8
.L_2il0floatpacket.9:
        .long 0x00000000,0x41707ef8
        .type .L_2il0floatpacket.9,@object
        .size .L_2il0floatpacket.9,8

/* 2.0 */
        .align 8
.L_2il0floatpacket.10:
        .long 0x00000000,0x40000000
        .type .L_2il0floatpacket.10,@object
        .size .L_2il0floatpacket.10,8

/* ~184.665 (about 128/ln 2): scale factor of the ..B1.13 path. */
        .align 8
.L_2il0floatpacket.11:
        .long 0x652b82fe,0x40671547
        .type .L_2il0floatpacket.11,@object
        .size .L_2il0floatpacket.11,8

/* Two-part constant ~0.0054152 (about ln 2/128): high part, then low-order part. */
        .align 8
.L_2il0floatpacket.12:
        .long 0x00000000,0x3f762e42
        .type .L_2il0floatpacket.12,@object
        .size .L_2il0floatpacket.12,8
        .align 8
.L_2il0floatpacket.13:
        .long 0x3de6af28,0x3e2fdf47
        .type .L_2il0floatpacket.13,@object
        .size .L_2il0floatpacket.13,8

/* -1.0 */
        .align 8
.L_2il0floatpacket.14:
        .long 0x00000000,0xbff00000
        .type .L_2il0floatpacket.14,@object
        .size .L_2il0floatpacket.14,8

/* Packets 15-19: polynomial coefficients of the ..B1.13 path (~1/120, ~1/6, ~1/720, ~1/24, 0.5). */
        .align 8
.L_2il0floatpacket.15:
        .long 0x6887cd7c,0x3f811111
        .type .L_2il0floatpacket.15,@object
        .size .L_2il0floatpacket.15,8
        .align 8
.L_2il0floatpacket.16:
        .long 0x55555405,0x3fc55555
        .type .L_2il0floatpacket.16,@object
        .size .L_2il0floatpacket.16,8
        .align 8
.L_2il0floatpacket.17:
        .long 0x87372663,0x3f56c16c
        .type .L_2il0floatpacket.17,@object
        .size .L_2il0floatpacket.17,8
        .align 8
.L_2il0floatpacket.18:
        .long 0x5555541d,0x3fa55555
        .type .L_2il0floatpacket.18,@object
        .size .L_2il0floatpacket.18,8
        .align 8
.L_2il0floatpacket.19:
        .long 0x00000000,0x3fe00000
        .type .L_2il0floatpacket.19,@object
        .size .L_2il0floatpacket.19,8

/* Packets 20-23: coefficients of the small-|x| polynomial in ..B1.12 (~0.02186, ~0.13333, ~-0.05397, ~-1/3). */
        .align 8
.L_2il0floatpacket.20:
        .long 0xbe3b77e0,0x3f9662a1
        .type .L_2il0floatpacket.20,@object
        .size .L_2il0floatpacket.20,8
        .align 8
.L_2il0floatpacket.21:
        .long 0x1110d2a4,0x3fc11111
        .type .L_2il0floatpacket.21,@object
        .size .L_2il0floatpacket.21,8
        .align 8
.L_2il0floatpacket.22:
        .long 0x00fa2d61,0xbfaba1ba
        .type .L_2il0floatpacket.22,@object
        .size .L_2il0floatpacket.22,8
        .align 8
.L_2il0floatpacket.23:
        .long 0x55555555,0xbfd55555
        .type .L_2il0floatpacket.23,@object
        .size .L_2il0floatpacket.23,8

/* 1.0 */
        .align 8
.L_2il0floatpacket.25:
        .long 0x00000000,0x3ff00000
        .type .L_2il0floatpacket.25,@object
        .size .L_2il0floatpacket.25,8

/* Sign-indexed tables: entry 0 for a clear sign bit, entry 1 for a set sign bit. */
        .align 8
ones:
        .long 0x00000000,0x3ff00000             # +1.0
        .long 0x00000000,0xbff00000             # -1.0
        .type ones,@object
        .size ones,16
        .align 8
twos:
        .long 0x00000000,0x40000000             # +2.0
        .long 0x00000000,0xc0000000             # -2.0
        .type twos,@object
        .size twos,16

/* +-2^-1000, indexed by sign bit. */
        .align 8
_small_value_64:
        .long 0x00000000
        .long 0x01700000
        .long 0x00000000
        .long 0x81700000
        .type _small_value_64,@object
        .size _small_value_64,16

/* 2^52: adding then subtracting it rounds a small double to an integer. */
        .align 8
_TWO_52:
        .long 0x00000000
        .long 0x43300000
        .type _TWO_52,@object
        .size _TWO_52,8

/* 2^32: adding then subtracting it drops the low-order bits of a small double. */
        .align 8
_TWO_32:
        .long 0x00000000
        .long 0x41f00000
        .type _TWO_32,@object
        .size _TWO_32,8
  .data
  .section .note.GNU-stack, ""
# -- Begin DWARF2 SEGMENT .eh_frame
  .section .eh_frame,"a",@progbits
.eh_frame_seg:
        .align 1
# End