/* fmaf_wmt.S */
/*
 * Math library
 *
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 *
 * Author Name <jingwei.zhang@intel.com>
 * History:
 * 03-14-2016 Initial version. numerics svn rev. 12864
 */
  37. .file "fmaf_wmt.c"
  38. .text
  39. ..TXTST0:
  40. # -- Begin static_func
  41. .text
  42. .align 16,0x90
  43. static_func:
  44. ..B1.1:
  45. ..L1:
  46. call ..L2
  47. ..L2:
  48. popl %eax
  49. lea _GLOBAL_OFFSET_TABLE_+[. - ..L2](%eax), %eax
  50. lea static_const_table@GOTOFF(%eax), %eax
  51. ret
  52. .align 16,0x90
  53. .type static_func,@function
  54. .size static_func,.-static_func
  55. .data
  56. # -- End static_func
  57. .text
  58. # -- Begin fmaf
  59. .text
  60. .align 16,0x90
  61. .globl fmaf
  62. fmaf:
  63. # parameter 1: 8 + %ebp
  64. # parameter 2: 12 + %ebp
  65. # parameter 3: 16 + %ebp
  66. ..B2.1:
  67. ..L3:
  68. ..B2.2:
  69. pushl %ebp
  70. movl %esp, %ebp
  71. subl $136, %esp
  72. movl %ebx, 80(%esp)
  73. call static_func
  74. movl %eax, %ebx
  75. movss 144(%esp), %xmm0
  76. movss 148(%esp), %xmm1
  77. movss 152(%esp), %xmm2
  78. movss %xmm0, 8(%esp)
  79. movss %xmm1, 16(%esp)
  80. movss %xmm2, 24(%esp)
  81. movl 8(%esp), %eax
  82. ucomiss %xmm1, %xmm0
  83. movl 16(%esp), %ecx
  84. jp .L_2TAG_PACKET_0.0.2
  85. movl 24(%esp), %edx
  86. ucomiss %xmm2, %xmm2
  87. jp .L_2TAG_PACKET_1.0.2
  88. andl $2147483647, %eax
  89. je .L_2TAG_PACKET_2.0.2
  90. cmpl $1065353216, %eax
  91. je .L_2TAG_PACKET_3.0.2
  92. cmpl $2139095040, %eax
  93. je .L_2TAG_PACKET_4.0.2
  94. andl $2147483647, %ecx
  95. je .L_2TAG_PACKET_2.0.2
  96. cmpl $1065353216, %ecx
  97. je .L_2TAG_PACKET_3.0.2
  98. cmpl $2139095040, %ecx
  99. je .L_2TAG_PACKET_4.0.2
  100. andl $2147483647, %edx
  101. je .L_2TAG_PACKET_5.0.2
  102. cmpl $2139095040, %edx
  103. je .L_2TAG_PACKET_6.0.2
  104. cmpl $8388608, %eax
  105. jl .L_2TAG_PACKET_7.0.2
  106. cvtps2pd %xmm0, %xmm3
  107. .L_2TAG_PACKET_8.0.2:
  108. cmpl $8388608, %ecx
  109. jl .L_2TAG_PACKET_9.0.2
  110. cvtps2pd %xmm1, %xmm4
  111. .L_2TAG_PACKET_10.0.2:
  112. cmpl $8388608, %edx
  113. jl .L_2TAG_PACKET_11.0.2
  114. cvtps2pd %xmm2, %xmm0
  115. .L_2TAG_PACKET_12.0.2:
  116. mulsd %xmm4, %xmm3
  117. pextrw $3, %xmm3, %edx
  118. andl $32752, %edx
  119. movl $96, %eax
  120. pextrw $3, %xmm0, %ecx
  121. andl $32752, %ecx
  122. addl %edx, %eax
  123. subl %ecx, %eax
  124. cmpl $560, %eax
  125. jae .L_2TAG_PACKET_13.0.2
  126. addsd %xmm3, %xmm0
  127. jmp .L_2TAG_PACKET_14.0.2
  128. .L_2TAG_PACKET_0.0.2:
  129. .L_2TAG_PACKET_2.0.2:
  130. .L_2TAG_PACKET_3.0.2:
  131. .L_2TAG_PACKET_4.0.2:
  132. .L_2TAG_PACKET_5.0.2:
  133. flds 8(%esp)
  134. fmuls 16(%esp)
  135. fadds 24(%esp)
  136. jmp .L_2TAG_PACKET_15.0.2
  137. .L_2TAG_PACKET_1.0.2:
  138. flds 8(%esp)
  139. fadds 24(%esp)
  140. jmp .L_2TAG_PACKET_15.0.2
  141. .L_2TAG_PACKET_6.0.2:
  142. flds 24(%esp)
  143. jmp .L_2TAG_PACKET_15.0.2
  144. .L_2TAG_PACKET_7.0.2:
  145. movaps (%ebx), %xmm3
  146. pand %xmm0, %xmm3
  147. movaps 48(%ebx), %xmm5
  148. orpd 16(%ebx), %xmm3
  149. pand %xmm0, %xmm5
  150. subsd 16(%ebx), %xmm3
  151. psllq $32, %xmm5
  152. mulsd 32(%ebx), %xmm3
  153. orpd %xmm5, %xmm3
  154. jmp .L_2TAG_PACKET_8.0.2
  155. .L_2TAG_PACKET_9.0.2:
  156. movaps (%ebx), %xmm4
  157. pand %xmm1, %xmm4
  158. movaps 48(%ebx), %xmm5
  159. orpd 16(%ebx), %xmm4
  160. pand %xmm1, %xmm5
  161. subsd 16(%ebx), %xmm4
  162. psllq $32, %xmm5
  163. mulsd 32(%ebx), %xmm4
  164. orpd %xmm5, %xmm4
  165. jmp .L_2TAG_PACKET_10.0.2
  166. .L_2TAG_PACKET_11.0.2:
  167. movaps (%ebx), %xmm0
  168. pand %xmm2, %xmm0
  169. movaps 48(%ebx), %xmm5
  170. orpd 16(%ebx), %xmm0
  171. pand %xmm2, %xmm5
  172. subsd 16(%ebx), %xmm0
  173. psllq $32, %xmm5
  174. mulsd 32(%ebx), %xmm0
  175. orpd %xmm5, %xmm0
  176. jmp .L_2TAG_PACKET_12.0.2
  177. .L_2TAG_PACKET_13.0.2:
  178. pextrw $1, %xmm2, %ecx
  179. pextrw $3, %xmm3, %edx
  180. sarl $4, %eax
  181. xorl %edx, %ecx
  182. testl $32768, %ecx
  183. jne .L_2TAG_PACKET_16.0.2
  184. cmpl $53, %eax
  185. jge .L_2TAG_PACKET_17.0.2
  186. cmpl $-19, %eax
  187. jle .L_2TAG_PACKET_18.0.2
  188. cmpl $6, %eax
  189. jge .L_2TAG_PACKET_19.0.2
  190. movl $6, %ecx
  191. subl %eax, %ecx
  192. addl $58, %eax
  193. movsd 64(%ebx), %xmm1
  194. pand 64(%ebx), %xmm3
  195. pxor %xmm5, %xmm5
  196. por 96(%ebx), %xmm3
  197. pxor %xmm2, %xmm2
  198. pinsrw $0, %eax, %xmm5
  199. pinsrw $0, %ecx, %xmm2
  200. pand %xmm0, %xmm1
  201. pand 80(%ebx), %xmm0
  202. movdqa %xmm3, %xmm4
  203. psllq %xmm5, %xmm3
  204. por 96(%ebx), %xmm1
  205. psrlq %xmm2, %xmm4
  206. psrlq $40, %xmm3
  207. paddq %xmm4, %xmm1
  208. movdqa %xmm1, %xmm5
  209. psrlq $53, %xmm1
  210. movdqa %xmm5, %xmm4
  211. psrlq %xmm1, %xmm5
  212. pand %xmm1, %xmm4
  213. psllq $52, %xmm1
  214. por %xmm3, %xmm5
  215. paddq %xmm1, %xmm0
  216. por %xmm4, %xmm5
  217. pand 64(%ebx), %xmm5
  218. por %xmm5, %xmm0
  219. jmp .L_2TAG_PACKET_14.0.2
  220. .L_2TAG_PACKET_17.0.2:
  221. movapd 112(%ebx), %xmm0
  222. orpd %xmm3, %xmm0
  223. jmp .L_2TAG_PACKET_14.0.2
  224. .L_2TAG_PACKET_18.0.2:
  225. orpd 112(%ebx), %xmm0
  226. jmp .L_2TAG_PACKET_14.0.2
  227. .L_2TAG_PACKET_19.0.2:
  228. movl $70, %ecx
  229. subl %eax, %ecx
  230. subl $6, %eax
  231. movsd 64(%ebx), %xmm1
  232. pand 64(%ebx), %xmm0
  233. pxor %xmm5, %xmm5
  234. por 96(%ebx), %xmm0
  235. pxor %xmm2, %xmm2
  236. pinsrw $0, %ecx, %xmm5
  237. pinsrw $0, %eax, %xmm2
  238. pand %xmm3, %xmm1
  239. pand 80(%ebx), %xmm3
  240. movdqa %xmm0, %xmm4
  241. psllq %xmm5, %xmm0
  242. por 96(%ebx), %xmm1
  243. psrlq %xmm2, %xmm4
  244. pxor %xmm2, %xmm2
  245. psrlq $18, %xmm0
  246. psubq %xmm0, %xmm2
  247. paddq %xmm4, %xmm1
  248. psrlq $63, %xmm2
  249. movdqa %xmm1, %xmm0
  250. psrlq $53, %xmm1
  251. movdqa %xmm0, %xmm4
  252. psrlq %xmm1, %xmm0
  253. pand %xmm1, %xmm4
  254. psllq $52, %xmm1
  255. por %xmm2, %xmm0
  256. paddq %xmm1, %xmm3
  257. por %xmm4, %xmm0
  258. pand 64(%ebx), %xmm0
  259. por %xmm3, %xmm0
  260. jmp .L_2TAG_PACKET_14.0.2
  261. .L_2TAG_PACKET_16.0.2:
  262. cmpl $53, %eax
  263. jge .L_2TAG_PACKET_20.0.2
  264. cmpl $-22, %eax
  265. jle .L_2TAG_PACKET_21.0.2
  266. cmpl $6, %eax
  267. jge .L_2TAG_PACKET_22.0.2
  268. movl $6, %ecx
  269. subl %eax, %ecx
  270. addl $58, %eax
  271. movsd 64(%ebx), %xmm1
  272. pand 64(%ebx), %xmm3
  273. pxor %xmm5, %xmm5
  274. por 96(%ebx), %xmm3
  275. pxor %xmm2, %xmm2
  276. pinsrw $0, %eax, %xmm5
  277. pinsrw $0, %ecx, %xmm2
  278. pand %xmm0, %xmm1
  279. pand 80(%ebx), %xmm0
  280. movdqa %xmm3, %xmm4
  281. psllq %xmm5, %xmm3
  282. por 96(%ebx), %xmm1
  283. psrlq %xmm2, %xmm4
  284. pxor %xmm2, %xmm2
  285. psrlq $37, %xmm3
  286. psubq %xmm3, %xmm2
  287. psubq %xmm4, %xmm1
  288. psrlq $63, %xmm2
  289. psubq %xmm2, %xmm1
  290. movdqa %xmm1, %xmm3
  291. movsd 112(%ebx), %xmm2
  292. psrlq $52, %xmm1
  293. psubq %xmm1, %xmm2
  294. movdqa %xmm2, %xmm1
  295. psllq $52, %xmm2
  296. psllq %xmm1, %xmm3
  297. pand 64(%ebx), %xmm3
  298. psubq %xmm2, %xmm0
  299. por %xmm3, %xmm0
  300. jmp .L_2TAG_PACKET_14.0.2
  301. .L_2TAG_PACKET_20.0.2:
  302. movsd 64(%ebx), %xmm1
  303. pand %xmm3, %xmm1
  304. por 96(%ebx), %xmm1
  305. psubq 112(%ebx), %xmm1
  306. movapd %xmm1, %xmm0
  307. psrlq $52, %xmm1
  308. movapd 112(%ebx), %xmm4
  309. psubq %xmm1, %xmm4
  310. psllq %xmm4, %xmm0
  311. psllq $52, %xmm4
  312. pand 80(%ebx), %xmm3
  313. psubq %xmm4, %xmm3
  314. pand 64(%ebx), %xmm0
  315. por %xmm3, %xmm0
  316. jmp .L_2TAG_PACKET_14.0.2
  317. .L_2TAG_PACKET_21.0.2:
  318. movsd 64(%ebx), %xmm1
  319. pand %xmm0, %xmm1
  320. por 96(%ebx), %xmm1
  321. psubq 112(%ebx), %xmm1
  322. movapd %xmm1, %xmm2
  323. psrlq $52, %xmm1
  324. movapd 112(%ebx), %xmm3
  325. psubq %xmm1, %xmm3
  326. psllq %xmm3, %xmm2
  327. psllq $52, %xmm3
  328. pand 80(%ebx), %xmm0
  329. psubq %xmm3, %xmm0
  330. pand 64(%ebx), %xmm2
  331. por %xmm2, %xmm0
  332. jmp .L_2TAG_PACKET_14.0.2
  333. .L_2TAG_PACKET_22.0.2:
  334. movl $70, %ecx
  335. subl %eax, %ecx
  336. subl $6, %eax
  337. movsd 64(%ebx), %xmm1
  338. pand 64(%ebx), %xmm0
  339. pxor %xmm5, %xmm5
  340. por 96(%ebx), %xmm0
  341. pxor %xmm2, %xmm2
  342. pinsrw $0, %ecx, %xmm5
  343. pinsrw $0, %eax, %xmm2
  344. pand %xmm3, %xmm1
  345. pand 80(%ebx), %xmm3
  346. movdqa %xmm0, %xmm4
  347. psllq %xmm5, %xmm0
  348. por 96(%ebx), %xmm1
  349. psrlq %xmm2, %xmm4
  350. pxor %xmm2, %xmm2
  351. psrlq $18, %xmm0
  352. psubq %xmm0, %xmm2
  353. psubq %xmm4, %xmm1
  354. psrlq $63, %xmm2
  355. psubq %xmm2, %xmm1
  356. movdqa %xmm1, %xmm0
  357. movsd 112(%ebx), %xmm2
  358. psrlq $52, %xmm1
  359. psubq %xmm1, %xmm2
  360. movdqa %xmm2, %xmm1
  361. psllq $52, %xmm2
  362. psllq %xmm1, %xmm0
  363. pand 64(%ebx), %xmm0
  364. psubq %xmm2, %xmm3
  365. por %xmm3, %xmm0
  366. jmp .L_2TAG_PACKET_14.0.2
  367. .L_2TAG_PACKET_14.0.2:
  368. movq %xmm0, (%esp)
  369. fldl (%esp)
  370. fstps 32(%esp)
  371. flds 32(%esp)
  372. .L_2TAG_PACKET_15.0.2:
  373. movl 80(%esp), %ebx
  374. movl %ebp, %esp
  375. popl %ebp
  376. ret
  377. ..B2.3:
  378. .align 16,0x90
  379. .type fmaf,@function
  380. .size fmaf,.-fmaf
  381. .data
  382. # -- End fmaf
  383. .section .rodata, "a"
  384. .align 16
  385. .align 16
  386. static_const_table:
  387. .long 2147483647
  388. .long 0
  389. .long 0
  390. .long 0
  391. .long 0
  392. .long 1072693248
  393. .long 0
  394. .long 0
  395. .long 0
  396. .long 970981376
  397. .long 0
  398. .long 0
  399. .long 2147483648
  400. .long 0
  401. .long 0
  402. .long 0
  403. .long 4294967295
  404. .long 1048575
  405. .long 0
  406. .long 0
  407. .long 0
  408. .long 4293918720
  409. .long 0
  410. .long 0
  411. .long 0
  412. .long 1048576
  413. .long 0
  414. .long 0
  415. .long 1
  416. .long 0
  417. .long 0
  418. .long 0
  419. .type static_const_table,@object
  420. .size static_const_table,128
  421. .data
  422. .section .note.GNU-stack, ""
  423. # End