csqrtf_wmt.S 9.9 KB


  1. /*
  2. * Math library
  3. *
  4. * Copyright (C) 2016 Intel Corporation. All rights reserved.
  5. *
  6. * Redistribution and use in source and binary forms, with or without
  7. * modification, are permitted provided that the following conditions
  8. * are met:
  9. *
  10. * * Redistributions of source code must retain the above copyright
  11. * notice, this list of conditions and the following disclaimer.
  12. * * Redistributions in binary form must reproduce the above copyright
  13. * notice, this list of conditions and the following disclaimer in
  14. * the documentation and/or other materials provided with the
  15. * distribution.
  16. * * Neither the name of Intel Corporation nor the names of its
  17. * contributors may be used to endorse or promote products derived
  18. * from this software without specific prior written permission.
  19. *
  20. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  21. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  22. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  23. * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  24. * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  25. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  26. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  27. * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  28. * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  29. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  30. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  31. *
  32. *
  33. * Author Name <jingwei.zhang@intel.com>
  34. * History:
  35. * 03-14-2016 Initial version. numerics svn rev. 12864
  36. */
  37. .file "csqrtf_wmt.c"
  38. .text
  39. ..TXTST0:
  40. # -- Begin static_func
  # static_func: position-independent helper that returns the run-time
  # address of static_const_table in %eax.
  #   In:  nothing.
  #   Out: %eax = address of static_const_table.
  #   Writes only %eax; the call/pop pair below pushes and pops one word, so
  #   %esp is unchanged when the ret executes. No stack frame is created.
  41. .text
  42. .align 16,0x90
  43. static_func:
  44. ..B1.1:
  45. ..L1:
  # call to the very next instruction: pushes the address of ..L2.
  46. call ..L2
  47. ..L2:
  # %eax = address of ..L2 (the return address pushed by the call above).
  48. popl %eax
  # Add the displacement from ..L2 to _GLOBAL_OFFSET_TABLE_.
  49. lea _GLOBAL_OFFSET_TABLE_+[. - ..L2](%eax), %eax
  # Add the table's @GOTOFF offset to reach static_const_table.
  50. lea static_const_table@GOTOFF(%eax), %eax
  51. ret
  52. .align 16,0x90
  53. .type static_func,@function
  54. .size static_func,.-static_func
  55. .data
  56. # -- End static_func
  57. .text
  58. # -- Begin csqrtf
  # csqrtf -- 32-bit x86, AT&T syntax. Judging by the name and the arithmetic
  # below, this is presumably the C99 complex square root for float _Complex.
  #
  # Inputs:  two 32-bit floats read from the caller's stack at 8(%ebp) and
  #          12(%ebp); the comments below call them x and y.
  # Outputs: two 32-bit float bit patterns left in %eax and %edx at the ret.
  #          NOTE(review): presumably %eax = real part, %edx = imaginary part
  #          -- confirm against the target ABI.
  # %ebx is saved at 160(%esp) and restored before returning; inside the
  # function it holds the address of static_const_table (from static_func).
  # Writes %eax, %ecx, %edx and %xmm0-%xmm7. Some paths use the x87 stack; every
  # path shown here pops what it pushes.
  59. .text
  60. .align 16,0x90
  61. .globl csqrtf
  62. csqrtf:
  63. # parameter 1: 8 + %ebp
  64. ..B2.1:
  65. ..L3:
  66. ..B2.2:
  # Prologue: 216-byte local frame, so %esp = %ebp - 216 from here on.
  67. pushl %ebp
  68. movl %esp, %ebp
  69. subl $216, %esp
  70. movl %ebx, 160(%esp)
  71. call static_func
  # %ebx = base of static_const_table for all N(%ebx) operands below.
  72. movl %eax, %ebx
  # 224(%esp) and 228(%esp) are 8(%ebp) and 12(%ebp): x and y.
  73. movss 224(%esp), %xmm0
  74. movss 228(%esp), %xmm1
  # Spill both floats so their raw bit patterns can be read as integers.
  75. movss %xmm0, (%esp)
  76. movss %xmm1, 4(%esp)
  77. movl (%esp), %eax
  78. movl 4(%esp), %ecx
  # xmm0 = (x, y, 0, 0); the upper elements are zero because movss from memory
  # clears them.
  79. unpcklps %xmm1, %xmm0
  # %edx keeps the raw bits of y; its sign bit is merged into the result later.
  80. movl %ecx, %edx
  # Classify both inputs by exponent field. For each: isolate the exponent,
  # subtract 1 in the exponent position, re-mask, then subtract 0x7F000000.
  # The result has its sign bit set only when the exponent field is 1..254.
  81. andl $2139095040, %eax
  82. andl $2139095040, %ecx
  83. subl $8388608, %eax
  84. subl $8388608, %ecx
  85. andl $2139095040, %eax
  86. andl $2139095040, %ecx
  87. subl $2130706432, %eax
  88. subl $2130706432, %ecx
  # SF is set only if both values are negative, i.e. both exponent fields are
  # in 1..254. Anything else (zero/subnormal exponent or Inf/NaN) leaves here.
  89. testl %ecx, %eax
  90. jns .L_2TAG_PACKET_0.0.2
  # ---- Fast path: both inputs have exponent fields in 1..254 ----
  # Widen to double: xmm0 = (x, y).
  91. cvtps2pd %xmm0, %xmm0
  92. pxor %xmm4, %xmm4
  # 16 goes into word 3 of xmm4 below: 0x00100000 in the high dword of the low
  # double, i.e. 1 in the double exponent field.
  93. movl $16, %eax
  94. movapd %xmm0, %xmm1
  # xmm1.lo = y
  95. unpckhpd %xmm1, %xmm1
  96. movapd %xmm0, %xmm7
  # xmm0.lo = x*x
  97. mulsd %xmm0, %xmm0
  # xmm6 = signed (x, y) as doubles; word 3 carries the sign of x (read at
  # .L_2TAG_PACKET_1.0.2).
  98. movapd %xmm7, %xmm6
  # xmm1.lo = y*y
  99. mulsd %xmm1, %xmm1
  100. pinsrw $3, %eax, %xmm4
  # xmm0.lo = x*x + y*y
  101. addsd %xmm1, %xmm0
  # xmm7 = (|x|, |y|): the constant at (%ebx) clears the sign bit of each double.
  102. andpd (%ebx), %xmm7
  # xmm0.lo = sqrt(x*x + y*y)
  103. sqrtsd %xmm0, %xmm0
  # xmm0.lo = sqrt(x*x + y*y) + |x|
  104. addsd %xmm7, %xmm0
  # Decrement the exponent field of the low double by one: halves the value.
  # Call the result t = (sqrt(x*x + y*y) + |x|) / 2.
  105. psubd %xmm4, %xmm0
  # Register-to-register movsd copies only the low double: xmm7 = (t, |y|).
  106. movsd %xmm0, %xmm7
  # The sequence down to the psubd at line 118 builds an approximation r of
  # 1/sqrt(t) from a single-precision rsqrtss seed. It rearranges the bits of t
  # with integer ops and the constants at 16/32/48/64(%ebx).
  # NOTE(review): the exact bit-level scheme is not derivable from this file
  # alone -- treat the per-constant roles as unverified.
  107. movdqa %xmm0, %xmm1
  108. pand 16(%ebx), %xmm0
  # xmm2 keeps an untouched copy of t for the correction step.
  109. movdqa %xmm1, %xmm2
  110. paddd 32(%ebx), %xmm0
  111. psrld $1, %xmm1
  112. psrlq $29, %xmm0
  113. pand 48(%ebx), %xmm1
  114. rsqrtss %xmm0, %xmm0
  115. psubd 64(%ebx), %xmm1
  116. psllq $29, %xmm0
  117. movapd 80(%ebx), %xmm3
  # xmm0.lo = r, the initial approximation of 1/sqrt(t).
  118. psubd %xmm1, %xmm0
  # Correction step: with e = t*r*r - 1.0 (48(%ebx) is 1.0),
  # r <- r + r*e*(c96 + c80*e), where c80 and c96 are the doubles at 80(%ebx)
  # and 96(%ebx).
  119. movapd 96(%ebx), %xmm1
  120. mulsd %xmm0, %xmm2
  121. movapd 48(%ebx), %xmm4
  122. mulsd %xmm0, %xmm2
  123. subsd %xmm4, %xmm2
  124. mulsd %xmm2, %xmm3
  125. addsd %xmm1, %xmm3
  126. mulsd %xmm2, %xmm3
  127. mulsd %xmm0, %xmm3
  128. addsd %xmm3, %xmm0
  # xmm7 = (t, |y|) * (1.0, 0.5) = (t, |y|/2)
  129. mulpd 112(%ebx), %xmm7
  # xmm0 = (r, r)
  130. unpcklpd %xmm0, %xmm0
  # ---- Common tail, shared with the zero/subnormal path ----
  # On entry: xmm7 = (t, |y|/2), xmm0 = (r, r) with r ~ 1/sqrt(t), word 3 of
  # xmm6 holds the sign bit of x, and %edx holds the raw bits of y.
  131. .L_2TAG_PACKET_1.0.2:
  132. pextrw $3, %xmm6, %eax
  # xmm0 = (a, b) with a = t*r ~ sqrt(t) and b = (|y|/2)*r ~ |y| / (2*sqrt(t)).
  133. mulpd %xmm7, %xmm0
  # Keep only the sign bit of y in %edx.
  134. andl $-2147483648, %edx
  # xmm1 = (a, b) rounded to float.
  135. cvtpd2ps %xmm0, %xmm1
  # Bit 15 of the extracted word is the sign of x.
  136. testl $32768, %eax
  # Low element of xmm2 = b (pshufd does not alter EFLAGS).
  137. pshufd $17, %xmm1, %xmm2
  138. je .L_2TAG_PACKET_2.0.2
  # x is negative: %eax = b, %edx = sign(y) | a.
  139. movd %xmm1, %ecx
  140. movd %xmm2, %eax
  141. orl %ecx, %edx
  # Is the exponent field of b zero (b is zero or subnormal as a float)?
  142. testl $2139095040, %eax
  143. jmp .L_2TAG_PACKET_3.0.2
  # x is non-negative: %eax = a, %edx = sign(y) | b.
  144. .L_2TAG_PACKET_2.0.2:
  145. movd %xmm2, %ecx
  146. movd %xmm1, %eax
  147. orl %ecx, %edx
  # Same check on b, which is in %ecx on this branch.
  148. testl $2139095040, %ecx
  149. .L_2TAG_PACKET_3.0.2:
  # ZF set: b has a zero exponent field, so check further. Otherwise return.
  150. je .L_2TAG_PACKET_4.0.2
  151. jmp .L_2TAG_PACKET_5.0.2
  152. .L_2TAG_PACKET_4.0.2:
  # If y itself is +0 or -0 the small b is expected: return as is.
  153. testl $2147483647, 4(%esp)
  154. jne .L_2TAG_PACKET_6.0.2
  155. jmp .L_2TAG_PACKET_5.0.2
  # y is non-zero but b came out tiny: redo the double-to-float conversion of b
  # on the x87 unit.
  # NOTE(review): presumably so the rounding to a subnormal and the FP status
  # flags come from the x87 store -- confirm.
  156. .L_2TAG_PACKET_6.0.2:
  # 112(%esp) = b as a double (high half of xmm0).
  157. movhpd %xmm0, 112(%esp)
  # Decide which register held b. fldl and fstps do not modify EFLAGS, so the
  # je at line 161 still sees the result of this test.
  158. testl $2139095040, %eax
  159. fldl 112(%esp)
  160. fstps 112(%esp)
  161. je .L_2TAG_PACKET_7.0.2
  # %eax is a, so b belongs in %edx: sign(y) | reconverted b.
  162. andl $-2147483648, %edx
  163. orl 112(%esp), %edx
  164. jmp .L_2TAG_PACKET_8.0.2
  165. .L_2TAG_PACKET_7.0.2:
  # %eax held b: replace it with the reconverted value.
  166. movl 112(%esp), %eax
  167. .L_2TAG_PACKET_8.0.2:
  # Square the float at 112(%esp) on the x87 unit and store it back. The
  # product is not used: neither %eax nor %edx is reloaded afterwards.
  # NOTE(review): presumably done only to raise underflow/inexact status --
  # confirm. The 0x00800000 written to 116(%esp) is never read in this
  # function; check whether the following flds was meant to load 116(%esp).
  168. movl $8388608, 116(%esp)
  169. flds 112(%esp)
  170. fmul %st(0), %st
  171. fstps 112(%esp)
  172. jmp .L_2TAG_PACKET_5.0.2
  # ---- Special-input path: some exponent field is 0 or 255 ----
  # On entry xmm0 = (x, y, 0, 0) as floats.
  173. .L_2TAG_PACKET_0.0.2:
  174. movdqa %xmm0, %xmm2
  # xmm4 = (0, 0x7F800000, 0, 0x7F800000)
  175. movdqa 128(%ebx), %xmm4
  # xmm0 = (x, x, y, y)
  176. pshufd $80, %xmm0, %xmm0
  177. pxor %xmm5, %xmm5
  # xmm3 = (x, y, 0, 0): raw input bits, read again at .L_2TAG_PACKET_12.0.2.
  178. movdqa %xmm2, %xmm3
  # xmm0 = (|x| bits, exponent of x, |y| bits, exponent of y)
  179. pand 144(%ebx), %xmm0
  # xmm2 = (0, x, 0, y): each float's bits in the upper dword of a 64-bit lane.
  180. pshufd $115, %xmm2, %xmm2
  # Per-dword flags: (x is +-0, x is Inf/NaN, y is +-0, y is Inf/NaN).
  181. pcmpeqd %xmm4, %xmm0
  # xmm6 keeps the signed copy; word 3 is the top 16 bits of x.
  182. movdqa %xmm2, %xmm6
  # %eax bit 0: x zero, bit 1: x Inf/NaN, bit 2: y zero, bit 3: y Inf/NaN.
  183. movmskps %xmm0, %eax
  # xmm4 = (0, exponent field of x, 0, exponent field of y)
  184. pand %xmm2, %xmm4
  185. testl %eax, %eax
  186. jne .L_2TAG_PACKET_9.0.2
  # No zero and no Inf/NaN: both are subnormal or normal. No lane to clear.
  187. pxor %xmm0, %xmm0
  # Convert the two float bit patterns in xmm2 to doubles by hand.
  # NOTE(review): the steps below appear to realign exponent and mantissa
  # (psrlq $3), rebias the exponent (paddd), and renormalise lanes whose
  # exponent field is zero (por/subpd with the constant at 160(%ebx)) --
  # confirm the constant roles.
  188. .L_2TAG_PACKET_10.0.2:
  # Drop the sign bits: xmm2 = (|x|, |y|) bit patterns.
  189. pand (%ebx), %xmm2
  # Upper dword of each lane becomes all ones where the exponent field is 0.
  190. pcmpeqd %xmm5, %xmm4
  191. movdqa %xmm4, %xmm3
  192. pand 160(%ebx), %xmm4
  193. psrlq $3, %xmm2
  194. pand 176(%ebx), %xmm3
  195. por %xmm4, %xmm2
  196. paddd 192(%ebx), %xmm3
  197. subpd %xmm4, %xmm2
  198. paddd %xmm3, %xmm2
  # xmm0 = ~xmm0 & xmm2: clears any lane flagged as an exact zero input.
  199. pandn %xmm2, %xmm0
  # From here: same arithmetic as the fast path on xmm0 = (|x|, |y|) doubles,
  # but 1/sqrt(t) is computed with sqrtsd and divsd instead of the rsqrtss seed.
  200. pxor %xmm4, %xmm4
  201. movl $16, %eax
  202. movapd %xmm0, %xmm1
  203. unpckhpd %xmm1, %xmm1
  204. movapd %xmm0, %xmm7
  205. mulsd %xmm0, %xmm0
  206. mulsd %xmm1, %xmm1
  207. pinsrw $3, %eax, %xmm4
  208. addsd %xmm1, %xmm0
  209. sqrtsd %xmm0, %xmm0
  210. addsd %xmm7, %xmm0
  # t = (sqrt(x*x + y*y) + |x|) / 2, halved via the exponent field.
  211. psubd %xmm4, %xmm0
  # xmm7 = (t, |y|)
  212. movsd %xmm0, %xmm7
  # xmm1.lo = sqrt(t)
  213. sqrtsd %xmm0, %xmm1
  # xmm0 = (1.0, 1.0)
  214. movapd 48(%ebx), %xmm0
  # xmm0.lo = 1.0 / sqrt(t)
  215. divsd %xmm1, %xmm0
  # xmm7 = (t, |y|/2)
  216. mulpd 112(%ebx), %xmm7
  # Broadcast the reciprocal to both lanes and join the common tail.
  217. unpcklpd %xmm0, %xmm0
  218. jmp .L_2TAG_PACKET_1.0.2
  # At least one input is +-0 or Inf/NaN; %eax holds the movmskps flag bits.
  219. .L_2TAG_PACKET_9.0.2:
  # 5 = x is zero and y is zero.
  220. cmpl $5, %eax
  221. je .L_2TAG_PACKET_11.0.2
  # Bits 1 and 3: an Inf or NaN is present.
  222. testl $10, %eax
  223. jne .L_2TAG_PACKET_12.0.2
  # Exactly one input is zero: copy dwords 0 and 2 of the flags over dwords 1
  # and 3 so each zero flag covers its whole 64-bit lane, then rejoin.
  224. pshufd $160, %xmm0, %xmm0
  225. jmp .L_2TAG_PACKET_10.0.2
  # Both inputs are zero: return %eax = 0, %edx = sign bit of y only.
  226. .L_2TAG_PACKET_11.0.2:
  227. xorl %eax, %eax
  228. andl $-2147483648, %edx
  229. jmp .L_2TAG_PACKET_5.0.2
  # Inf/NaN dispatch. %edx = raw bits of y, xmm3 low dword = raw bits of x.
  230. .L_2TAG_PACKET_12.0.2:
  # %ecx = |y| bits, %eax = x bits.
  231. movl %edx, %ecx
  232. andl $2147483647, %ecx
  233. movd %xmm3, %eax
  234. cmpl $2139095040, %ecx
  # y is +-Inf
  235. je .L_2TAG_PACKET_13.0.2
  # |y| bits above 0x7F800000 (unsigned): y is a NaN.
  236. ja .L_2TAG_PACKET_14.0.2
  # y is finite, so x must be the Inf/NaN. Keep only the sign of y in %edx.
  237. andl $-2147483648, %edx
  # x == -Inf (0xFF800000)?
  238. cmpl $-8388608, %eax
  239. je .L_2TAG_PACKET_15.0.2
  # x == +Inf? If not, x is a NaN.
  240. cmpl $2139095040, %eax
  241. jne .L_2TAG_PACKET_16.0.2
  # x == +Inf, y finite: return %eax = +Inf bits, %edx = signed zero.
  242. jmp .L_2TAG_PACKET_5.0.2
  # x == -Inf, y finite: return %eax = 0, %edx = sign(y) | 0x7F800000.
  243. .L_2TAG_PACKET_15.0.2:
  244. xorl %eax, %eax
  245. orl $2139095040, %edx
  246. jmp .L_2TAG_PACKET_5.0.2
  # y is a NaN: split on x == +Inf, x == -Inf, anything else.
  247. .L_2TAG_PACKET_14.0.2:
  248. cmpl $2139095040, %eax
  249. je .L_2TAG_PACKET_17.0.2
  250. cmpl $-8388608, %eax
  251. je .L_2TAG_PACKET_18.0.2
  # Remaining NaN cases: both result words get the bits of y*x. xmm1 still
  # holds y from function entry; xmm3 low element is x.
  252. .L_2TAG_PACKET_16.0.2:
  253. mulss %xmm3, %xmm1
  # Push both inputs for the x87 multiply at .L_2TAG_PACKET_19.0.2.
  254. flds (%esp)
  255. flds 4(%esp)
  256. movd %xmm1, %eax
  257. movl %eax, %edx
  258. jmp .L_2TAG_PACKET_19.0.2
  # x == +Inf, y NaN: %eax stays +Inf; %edx = y with bit 22 (0x00400000) set.
  259. .L_2TAG_PACKET_17.0.2:
  260. flds 4(%esp)
  261. flds 4(%esp)
  262. orl $4194304, %edx
  263. jmp .L_2TAG_PACKET_19.0.2
  # x == -Inf, y NaN: %eax = y with bit 22 set; %edx = 0x7F800000 (+Inf).
  264. .L_2TAG_PACKET_18.0.2:
  265. flds 4(%esp)
  266. flds 4(%esp)
  267. movl %edx, %eax
  268. movl $2139095040, %edx
  269. orl $4194304, %eax
  270. jmp .L_2TAG_PACKET_19.0.2
  # y is +-Inf: %eax = 0x7F800000 (+Inf); %edx keeps the raw bits of y.
  271. .L_2TAG_PACKET_13.0.2:
  272. flds (%esp)
  273. fld1
  274. movl $2139095040, %eax
  275. .L_2TAG_PACKET_19.0.2:
  # Multiply the two x87 values pushed above, then pop and discard the product,
  # leaving the x87 stack empty again.
  # NOTE(review): presumably executed only for its effect on the FP status
  # flags -- confirm.
  276. fmulp
  277. fstp %st(0)
  # Epilogue: restore %ebx and the caller's frame; results are in %eax and %edx.
  278. .L_2TAG_PACKET_5.0.2:
  279. movl 160(%esp), %ebx
  280. movl %ebp, %esp
  281. popl %ebp
  282. ret
  283. ..B2.3:
  284. .align 16,0x90
  285. .type csqrtf,@function
  286. .size csqrtf,.-csqrtf
  287. .data
  288. # -- End csqrtf
  289. .section .rodata, "a"
  290. .align 16
  291. .align 16
  # static_const_table: 13 rows of 16 bytes (208 bytes in total) of read-only
  # constants. csqrtf addresses them as N(%ebx), where %ebx is the table
  # address returned by static_func. Each row is four .long values, low dword
  # first. The offsets quoted below are the byte offsets used by csqrtf.
  292. static_const_table:
  # Offset 0: 0x7FFFFFFF_FFFFFFFF in both 64-bit lanes. Used with andpd/pand
  # to clear the sign bit of each lane.
  293. .long 4294967295
  294. .long 2147483647
  295. .long 4294967295
  296. .long 2147483647
  # Offset 16: high dword 0x001FFFFF, low dword all ones. pand mask in the
  # rsqrtss seed sequence.
  297. .long 4294967295
  298. .long 2097151
  299. .long 4294967295
  300. .long 2097151
  # Offset 32: high dword 0x07E00000. paddd operand in the rsqrtss seed
  # sequence.
  301. .long 0
  302. .long 132120576
  303. .long 0
  304. .long 132120576
  # Offset 48: high dword 0x3FF00000, i.e. double 1.0 in both lanes. Used as a
  # pand mask, as the 1.0 subtracted in the correction step, and as the
  # dividend for divsd.
  305. .long 0
  306. .long 1072693248
  307. .long 0
  308. .long 1072693248
  # Offset 64: high dword 0x57F00000. psubd operand in the rsqrtss seed
  # sequence.
  309. .long 0
  310. .long 1475346432
  311. .long 0
  312. .long 1475346432
  # Offset 80: double with high dword 0x3FD80011 (about 0.375). Second-order
  # coefficient of the correction step.
  313. .long 2148429837
  314. .long 1071120401
  315. .long 2148429837
  316. .long 1071120401
  # Offset 96: double with high dword 0xBFE00005 (about -0.5). First-order
  # coefficient of the correction step.
  317. .long 195330
  318. .long 3219128325
  319. .long 195330
  320. .long 3219128325
  # Offset 112: doubles (1.0, 0.5). mulpd operand that halves only the upper
  # lane.
  321. .long 0
  322. .long 1072693248
  323. .long 0
  324. .long 1071644672
  # Offset 128: dwords (0, 0x7F800000, 0, 0x7F800000). Reference values for
  # the zero and Inf/NaN comparison (pcmpeqd) on the special-input path.
  325. .long 0
  326. .long 2139095040
  327. .long 0
  328. .long 2139095040
  # Offset 144: dwords (0x7FFFFFFF, 0x7F800000) twice. Magnitude mask and
  # exponent mask applied before that comparison.
  329. .long 2147483647
  330. .long 2139095040
  331. .long 2147483647
  332. .long 2139095040
  # Offset 160: high dword 0x07F00000. Selected by pand for lanes whose float
  # exponent field is zero, then combined with por/subpd.
  333. .long 0
  334. .long 133169152
  335. .long 0
  336. .long 133169152
  # Offset 176: high dword 0xF8200000. Selected by pand for the same
  # zero-exponent lanes and added to the offset-192 value.
  337. .long 0
  338. .long 4162846720
  339. .long 0
  340. .long 4162846720
  # Offset 192: high dword 0x38000000. Added with paddd to every lane during
  # the float-to-double repacking. NOTE(review): matches (1023 - 127) << 20,
  # the exponent rebias from float to double -- confirm.
  341. .long 0
  342. .long 939524096
  343. .long 0
  344. .long 939524096
  345. .type static_const_table,@object
  346. .size static_const_table,208
  347. .data
  348. .section .note.GNU-stack, ""
  349. # End