fmaf_gen.S 17 KB


  /*
   * Math library
   *
   * Copyright (C) 2016 Intel Corporation. All rights reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * * Redistributions of source code must retain the above copyright
   * notice, this list of conditions and the following disclaimer.
   * * Redistributions in binary form must reproduce the above copyright
   * notice, this list of conditions and the following disclaimer in
   * the documentation and/or other materials provided with the
   * distribution.
   * * Neither the name of Intel Corporation nor the names of its
   * contributors may be used to endorse or promote products derived
   * from this software without specific prior written permission.
   *
   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   *
   *
   * Author Name <jingwei.zhang@intel.com>
   * History:
   * 03-14-2016 Initial version. numerics svn rev. 12864
   */
  # Translation-unit preamble: source-file marker and start of the text section.
        .file   "fmaf_gen.c"
        .text
  ..TXTST0:
  # -- Begin fmaf
  #-----------------------------------------------------------------------
  # float fmaf(float x, float y, float z)
  #
  # Fused multiply-add, x*y + z, evaluated on the raw IEEE-754 single
  # precision bit patterns with integer arithmetic and rounded once at
  # the end according to the rounding-control bits read from MXCSR.
  #
  # Syntax: GAS, AT&T operand order (src, dst), x86-64.
  # In:     xmm0 = x, xmm1 = y, xmm2 = z
  # Out:    xmm0 = result
  # Saved:  r14, r15, rbx, rbp are pushed on entry and popped before each ret.
  # Other registers written: rax, rcx, rdx, rsi, rdi, r8-r11, xmm0, flags.
  # Calls:  none (leaf function).
  #
  # Stack frame (40 bytes below the four pushes, CFA = rsp + 80):
  #    0(%rsp)  scratch slot for the flag-raising products
  #    4(%rsp)  result bits
  #    8(%rsp)  x            12(%rsp)  MXCSR image
  #   16(%rsp)  y            24(%rsp)  z
  #
  # Magic numbers used throughout:
  #   2147483647  = 0x7FFFFFFF  mask that clears the sign bit
  #   -2147483648 = 0x80000000  sign bit / "half" bit of a 32-bit word
  #   2139095040  = 0x7F800000  exponent field all ones (Inf/NaN threshold)
  #   1065353216  = 0x3F800000  bit pattern of 1.0f
  #   8388608     = 0x00800000  implicit significand bit / smallest normal
  #   8388607     = 0x007FFFFF  fraction mask
  #   16777216    = 0x01000000  carry out of a 24-bit significand
  #   24576       = 0x6000      MXCSR bits 13-14 (rounding control, see the
  #                             Intel SDM): 0 nearest, 8192 down, 16384 up,
  #                             24576 toward zero
  #-----------------------------------------------------------------------
        .text
        .align  16,0x90
        .globl  fmaf
  fmaf:
  # parameter 1: %xmm0
  # parameter 2: %xmm1
  # parameter 3: %xmm2
  ..B1.1:
        .cfi_startproc
  ..___tag_value_fmaf.1:
  ..L2:
  # Prologue: save the four callee-saved registers used below, reserve the
  # 40-byte frame and spill the three arguments so their bits can be read
  # back as integers.
        pushq   %r14
        .cfi_def_cfa_offset 16
        .cfi_offset 14, -16
        pushq   %r15
        .cfi_def_cfa_offset 24
        .cfi_offset 15, -24
        pushq   %rbx
        .cfi_def_cfa_offset 32
        .cfi_offset 3, -32
        pushq   %rbp
        .cfi_def_cfa_offset 40
        .cfi_offset 6, -40
        subq    $40, %rsp
        .cfi_def_cfa_offset 80
        movss   %xmm0, 8(%rsp)
        movss   %xmm1, 16(%rsp)
        movss   %xmm2, 24(%rsp)
  ..B1.2:
        stmxcsr 12(%rsp)                # snapshot MXCSR for the rounding bits
  ..B1.3:
  # Raw bits:  esi = x, r8d = y, r11d = z.
  # Magnitudes (sign cleared): ebx = |x|, ebp = |y|, r10d = |z|.
  # edi = MXCSR & 0x6000.
        movl    16(%rsp), %r8d
        movl    %r8d, %ebp
        movl    24(%rsp), %r11d
        movl    %r11d, %r10d
        movl    8(%rsp), %esi
        movl    %esi, %ebx
        movzwl  12(%rsp), %edi
        andl    $2147483647, %ebp
        andl    $24576, %edi
        andl    $2147483647, %r10d
        andl    $2147483647, %ebx       # ZF from this andl: |x| == 0
        je      ..B1.86
  # Operand filter. Any of the following sends the call to ..B1.86, which
  # evaluates mulss followed by addss instead of the integer path:
  #   x or y is zero, has an all-ones exponent field, or has magnitude 1.0;
  #   z is zero or has an all-ones exponent field.
  ..B1.4:
        cmpl    $2139095040, %ebx
        jae     ..B1.86
  ..B1.5:
        cmpl    $1065353216, %ebx
        je      ..B1.86
  ..B1.6:
        testl   %ebp, %ebp
        je      ..B1.86
  ..B1.7:
        cmpl    $2139095040, %ebp
        jae     ..B1.86
  ..B1.8:
        cmpl    $1065353216, %ebp
        je      ..B1.86
  ..B1.9:
        testl   %r10d, %r10d
        je      ..B1.86
  ..B1.10:
        cmpl    $2139095040, %r10d
        jae     ..B1.86
  # Exponent extraction, one operand at a time: ecx <- x, edx <- y, eax <- z.
  # A magnitude below 0x00800000 has a zero exponent field (subnormal). It
  # is renormalised by OR-ing in the bits of 1.0f and subtracting +1.0 or
  # -1.0 (picked from _ones by the sign bit); the exponent field of that
  # difference minus 126 is then used as the operand's exponent, and the
  # renormalised bits replace the operand.
  ..B1.11:
        cmpl    $8388608, %ebx
        jae     ..B1.13
  ..B1.12:
        movl    %esi, %eax
        orl     $1065353216, %esi
        movl    %esi, 8(%rsp)
        lea     _ones(%rip), %rsi
        shrl    $31, %eax               # eax = sign of x, index into _ones
        movss   8(%rsp), %xmm0
        subss   (%rsi,%rax,4), %xmm0
        movd    %xmm0, %esi
        movss   %xmm0, 8(%rsp)
        movl    %esi, %ebx
        andl    $2147483647, %ebx
        movl    %ebx, %ecx
        shrl    $23, %ecx
        addl    $-126, %ecx
        jmp     ..B1.14
  ..B1.13:
        movl    %ebx, %ecx
        shrl    $23, %ecx
  ..B1.14:
        cmpl    $8388608, %ebp
        jae     ..B1.16
  ..B1.15:
        movl    %r8d, %eax
        orl     $1065353216, %r8d
        movl    %r8d, 16(%rsp)
        lea     _ones(%rip), %r8
        shrl    $31, %eax               # eax = sign of y, index into _ones
        movss   16(%rsp), %xmm0
        subss   (%r8,%rax,4), %xmm0
        movd    %xmm0, %r8d
        movss   %xmm0, 16(%rsp)
        movl    %r8d, %ebp
        andl    $2147483647, %ebp
        movl    %ebp, %edx
        shrl    $23, %edx
        addl    $-126, %edx
        jmp     ..B1.17
  ..B1.16:
        movl    %ebp, %edx
        shrl    $23, %edx
  ..B1.17:
        cmpl    $8388608, %r10d
        jae     ..B1.19
  ..B1.18:
        movl    %r11d, %eax
        orl     $1065353216, %r11d
        movl    %r11d, 24(%rsp)
        lea     _ones(%rip), %r11
        shrl    $31, %eax               # eax = sign of z, index into _ones
        movss   24(%rsp), %xmm0
        subss   (%r11,%rax,4), %xmm0
        movd    %xmm0, %r11d
        movss   %xmm0, 24(%rsp)
        movl    %r11d, %r10d
        andl    $2147483647, %r10d
        movl    %r10d, %eax
        shrl    $23, %eax
        addl    $-126, %eax
        jmp     ..B1.20
  ..B1.19:
        movl    %r10d, %eax
        shrl    $23, %eax
  ..B1.20:
  # Exact product of the significands.
  #   r9d  = ecx + edx - 127            biased exponent of x*y
  #   rbx  = (24-bit sig of x) * (24-bit sig of y), a 48-bit integer
  #   r10d = 24-bit significand of z (implicit bit restored)
  #   esi  = sign(x) XOR sign(y), kept in bit 31 only
  #   ebp  = low 32 bits of the product, edx = high 32 bits
  # Bit 15 of edx is bit 47 of the product and selects the normalisation.
        lea     -127(%rcx,%rdx), %r9d
        andq    $8388607, %rbx
        andq    $8388607, %rbp
        orq     $8388608, %rbx
        orq     $8388608, %rbp
        andl    $8388607, %r10d
        imulq   %rbp, %rbx
        movq    %rbx, %rdx
        xorl    %r8d, %esi
        shrq    $32, %rdx
        orl     $8388608, %r10d
        andl    $-2147483648, %esi
        movl    %ebx, %ebp
        testl   $32768, %edx
        je      ..B1.22
  ..B1.21:
  # Bit 47 set: ebx = product bits 47..24, exponent + 1,
  # ebp = product bits 23..0 left-aligned.
        shrq    $24, %rbx
        incl    %r9d
        shll    $8, %ebp
        jmp     ..B1.23
  ..B1.22:
  # Bit 47 clear: ebx = product bits 46..23, ebp = bits 22..0 left-aligned.
        shrq    $23, %rbx
        shll    $9, %ebp
  ..B1.23:
  # Order the two addends: signed compare of the exponents, then unsigned
  # compare of the 24-bit high words when the exponents are equal.
        cmpl    %eax, %r9d
        jg      ..B1.26
  ..B1.24:
        jne     ..B1.27
  ..B1.25:
        cmpl    %r10d, %ebx
        jb      ..B1.27
  ..B1.26:
  # Product is the larger addend: it stays in ebx:ebp with sign esi and
  # exponent r9d. z becomes the smaller addend eax:edx = sig(z):0 with its
  # sign in r10d. r8d = exponent difference.
        movl    %r9d, %r8d
        xorl    %edx, %edx
        subl    %eax, %r8d
        movl    %r10d, %eax
        movl    %r11d, %r10d
        andl    $-2147483648, %r10d
        jmp     ..B1.28
  ..B1.27:
  # z is the larger addend: swap roles. Afterwards ebx:ebp = sig(z):0,
  # esi = sign of z, r9d = exponent of z; eax:edx = product, r10d = its
  # sign; r8d = exponent difference.
        movl    %eax, %r8d
        movl    %ebp, %edx
        subl    %r9d, %r8d
        movl    %eax, %r9d
        movl    %ebx, %eax
        movl    %r10d, %ebx
        movl    %esi, %r10d
        movl    %r11d, %esi
        xorl    %ebp, %ebp
        andl    $-2147483648, %esi
  ..B1.28:
  # Align the smaller addend: shift eax:edx right by r8d. r14d is a third
  # word that receives what falls off the bottom. Every arm leaves
  # r11d = 1, which later code uses as the constant for cmov.
        xorl    %r14d, %r14d
        testl   %r8d, %r8d
        je      ..B1.95
  ..B1.29:
        cmpl    $32, %r8d
        jge     ..B1.31
  ..B1.30:
  # Shift of 1..31: r14d = bits shifted out of edx (left-aligned),
  # edx = (edx >> n) | (eax << (32-n)), eax >>= n.
  # A shift count of -n in cl acts as 32-n because shll/shrl use cl mod 32.
        movl    %r8d, %r15d
        movl    %r8d, %ecx
        negl    %r15d
        movl    $-1, %r11d
        shrl    %cl, %r11d
        movl    %r15d, %ecx
        movl    %edx, %r14d
        notl    %r11d                   # mask of the top n bits
        shll    %cl, %r14d
        movl    %r15d, %ecx
        movl    %eax, %r15d
        shll    %cl, %r15d
        movl    %r8d, %ecx
        shrl    %cl, %edx
        andl    %r11d, %r15d
        movl    %r8d, %ecx
        orl     %r15d, %edx
        shrl    %cl, %eax
        movl    $1, %r11d
        jmp     ..B1.34
  ..B1.31:
        cmpl    $64, %r8d
        jge     ..B1.33
  ..B1.32:
  # Shift of 32..63: the old low word collapses to a single sticky bit
  # (bit 0 of r14d), r14d also takes the bits shifted out of eax,
  # edx = eax >> (n-32), eax = 0.
        movl    %r8d, %r14d
        movl    %r8d, %ecx
        negl    %r14d
        movl    $-1, %r11d
        shrl    %cl, %r11d
        movl    %r14d, %ecx
        movl    %eax, %r14d
        notl    %r11d
        shll    %cl, %r14d
        movl    %r8d, %ecx
        andl    %r11d, %r14d
        movl    $1, %r11d
        testl   %edx, %edx
        cmovne  %r11d, %edx
        orl     %edx, %r14d
        movl    %eax, %edx
        shrl    %cl, %edx
        xorl    %eax, %eax
        jmp     ..B1.34
  ..B1.33:
  # Shift of 64 or more: nothing of the smaller addend remains except a
  # sticky 1 in r14d when it was nonzero.
        movl    $1, %r11d
        orl     %eax, %edx
        cmovne  %r11d, %r14d
        xorl    %edx, %edx
        xorl    %eax, %eax
        jmp     ..B1.34
  ..B1.95:
        movl    $1, %r11d               # no shift needed
  ..B1.34:
  # Equal signs -> add magnitudes, different signs -> subtract.
        cmpl    %r10d, %esi
        jne     ..B1.38
  ..B1.35:
  # Addition of the two-word magnitudes; the carry out of the low word is
  # recovered as (sum < addend) and added to the high word.
        addl    %ebp, %edx
        addl    %eax, %ebx
        cmpl    %ebp, %edx
        movl    $0, %ebp                # movl keeps the flags for setb
        setb    %bpl
        addl    %ebp, %ebx
        testl   $16777216, %ebx
        je      ..B1.37
  ..B1.36:
  # Sum reached bit 24: shift right by one and bump the exponent.
  # eax = (bit shifted out of ebx) in bit 31 | sticky(edx | r14d) in bit 0.
        orl     %r14d, %edx
        movl    %ebx, %eax
        movl    $0, %edx                # movl keeps ZF from the orl
        cmovne  %r11d, %edx
        incl    %r9d
        shll    $31, %eax
        shrl    $1, %ebx
        orl     %edx, %eax
        andl    $8388607, %ebx
        jmp     ..B1.54
  ..B1.37:
  # No carry: eax = low word | sticky(r14d); ebx = 23 fraction bits.
        testl   %r14d, %r14d
        movl    %edx, %eax
        cmovne  %r11d, %r14d
        andl    $8388607, %ebx
        orl     %r14d, %eax
        jmp     ..B1.54
  ..B1.38:
  # Subtraction (ebx:ebp:0) - (eax:edx:r14d) -> ebx:edx:r14d, done as one
  # borrow chain. negl sets CF exactly when r14d was nonzero and each sbbl
  # both consumes and produces the borrow, so the borrow into the high word
  # is also correct when edx is 0xFFFFFFFF and a borrow arrives from the
  # sticky word (the low difference then wraps to the minuend, which a
  # compare-based borrow test cannot distinguish from "no borrow").
  # ZF after the last sbbl tells whether the high word became zero.
  # r8d and ebp are not needed past this point: every path writes them
  # again before reading them.
        negl    %r14d
        sbbl    %edx, %ebp
        movl    %ebp, %edx              # movl keeps CF for the next sbbl
        sbbl    %eax, %ebx
        je      ..B1.40
  # Find the most significant nonzero word of the difference.
  # eax = word to scan (high word moved up so bit 23 sits at bit 31),
  # ebp = shift already implied by skipping the words above it.
  ..B1.39:
        movl    %ebx, %eax
        xorl    %ebp, %ebp
        shll    $8, %eax
        jmp     ..B1.44
  ..B1.40:
        testl   %edx, %edx
        je      ..B1.42
  ..B1.41:
        movl    %edx, %eax
        movl    $24, %ebp
        jmp     ..B1.44
  ..B1.42:
        testl   %r14d, %r14d
        je      ..B1.85                 # all three words zero: exact zero
  ..B1.43:
        movl    %r14d, %eax
        movl    $56, %ebp
  ..B1.44:
  # Count leading zeros of eax by doubling until bit 31 is set; ebp ends
  # up as the left shift that puts the leading 1 at bit 23 of ebx.
        testl   $-2147483648, %eax
        jne     ..B1.48
  ..B1.46:
        addl    %eax, %eax
        incl    %ebp
        testl   $-2147483648, %eax
        je      ..B1.46
  ..B1.48:
        cmpl    $32, %ebp
        jge     ..B1.50
  ..B1.49:
  # Left shift by n = 0..31:
  #   ebx = ((ebx << n) | (edx >> (32-n))) & fraction mask
  #   eax = (edx << n) | sticky(r14d)
  # The (1 << n) - 1 mask makes the edx contribution vanish when n = 0.
        movl    %ebp, %eax
        movl    %ebp, %ecx
        negl    %eax
        movl    %edx, %r10d
        shll    %cl, %ebx
        movl    %eax, %ecx
        shrl    %cl, %r10d
        movl    %ebp, %ecx
        movl    %r11d, %r8d
        testl   %r14d, %r14d
        cmovne  %r11d, %r14d
        shll    %cl, %r8d
        movl    %ebp, %ecx
        decl    %r8d
        shll    %cl, %edx
        andl    %r8d, %r10d
        orl     %r10d, %ebx
        movl    %edx, %eax
        andl    $8388607, %ebx
        orl     %r14d, %eax
        jmp     ..B1.53
  ..B1.50:
        cmpl    $64, %ebp
        jge     ..B1.52
  ..B1.51:
  # Left shift by n = 32..63 (reached only with ebx == 0):
  #   ebx = ((edx << (n-32)) | (r14d >> (64-n))) & fraction mask
  #   eax = r14d << (n-32)
        movl    %ebp, %eax
        movl    %ebp, %ecx
        negl    %eax
        movl    %r14d, %r8d
        shll    %cl, %edx
        movl    %eax, %ecx
        shrl    %cl, %r8d
        movl    %ebp, %ecx
        movl    %r11d, %ebx
        movl    %r14d, %eax
        shll    %cl, %ebx
        movl    %ebp, %ecx
        decl    %ebx
        andl    %ebx, %r8d
        orl     %r8d, %edx
        movl    %edx, %ebx
        shll    %cl, %eax
        andl    $8388607, %ebx
        jmp     ..B1.53
  ..B1.52:
  # Left shift by 64 or more: only r14d is left; nothing remains below it.
        movl    %ebp, %ecx
        xorl    %eax, %eax
        shll    %cl, %r14d
        movl    %r14d, %ebx
        andl    $8388607, %ebx
  ..B1.53:
        subl    %ebp, %r9d              # exponent drops by the shift
  ..B1.54:
  # Here: ebx = 23 fraction bits, r9d = biased exponent, eax = discarded
  # bits (bit 31 = first discarded bit, lower bits = sticky), esi = sign.
  # (r9d - 1) unsigned-below 254 means the exponent is in 1..254.
        lea     -1(%r9), %edx
        cmpl    $254, %edx
        jb      ..B1.60
  ..B1.55:
        cmpl    $255, %r9d
        jge     ..B1.90                 # exponent too large: overflow
  ..B1.56:
  # Exponent <= 0: the result is below the normal range.
  # r9d = 1 - exponent = right shift needed.
        negl    %r9d
        incl    %r9d
        cmpl    $24, %r9d
        jg      ..B1.58
  ..B1.57:
  # Shift of 1..24: restore the implicit bit, fold the old eax into a
  # sticky bit, move the bits shifted out of ebx to the top of eax.
        movl    %r9d, %edx
        movl    %r9d, %ecx
        negl    %edx
        movl    $-1, %ebp
        orl     $8388608, %ebx
        testl   %eax, %eax
        movl    %ebx, %r8d
        cmovne  %r11d, %eax
        shrl    %cl, %ebp
        movl    %edx, %ecx
        shll    %cl, %r8d
        notl    %ebp
        andl    %ebp, %r8d
        movl    %r9d, %ecx
        shrl    %cl, %ebx
        orl     %r8d, %eax
        jmp     ..B1.61
  ..B1.58:
  # Shift above 24: magnitude bits become 0, only a sticky 1 remains.
        movl    %r11d, %eax
        xorl    %ebx, %ebx
        jmp     ..B1.62
  ..B1.60:
        shll    $23, %r9d               # pack exponent above the fraction
        orl     %r9d, %ebx
  ..B1.61:
        testl   %eax, %eax
        je      ..B1.84                 # nothing discarded: result exact
  ..B1.62:
  # Inexact result: dispatch on the MXCSR rounding bits saved in edi.
  # The scratch slot is first set to the bit pattern of 1.0f.
        movl    $1065353216, (%rsp)
        testl   %edi, %edi
        je      ..B1.87                 # 0: round to nearest
  ..B1.63:
        cmpl    $24576, %edi
        je      ..B1.80                 # 0x6000: keep truncated magnitude
  ..B1.64:
        cmpl    $16384, %edi
        jne     ..B1.72
  # edi == 0x4000: the magnitude is incremented when the sign is clear and
  # left truncated when the sign is set. The 2^-100 * 2^-100 product is
  # written to the scratch slot only; it appears to be evaluated for its
  # floating-point exception side effect (NOTE(review): confirm intent).
  ..B1.65:
        testl   %esi, %esi
        je      ..B1.67
  ..B1.66:
        cmpl    $8388608, %ebx
        jb      ..B1.70
        jmp     ..B1.84
  ..B1.67:
        cmpl    $8388607, %ebx
        jb      ..B1.70
  ..B1.68:
        jne     ..B1.71
  ..B1.69:
        cmpl    $-2147483648, %eax
        jae     ..B1.71
  ..B1.70:
        movss   .L_2il0floatpacket.1(%rip), %xmm0
        testl   %esi, %esi              # mulss/movss leave EFLAGS alone
        mulss   %xmm0, %xmm0
        movss   %xmm0, (%rsp)
        jne     ..B1.84
  ..B1.71:
        incl    %ebx
        cmpl    $2139095040, %ebx
        jae     ..B1.90                 # increment reached the Inf pattern
        jmp     ..B1.84
  ..B1.72:
        cmpl    $8192, %edi
        jne     ..B1.84
  # edi == 0x2000: mirror image of the block above, the magnitude is
  # incremented when the sign is set and left truncated when it is clear.
  ..B1.73:
        testl   %esi, %esi
        jne     ..B1.75
  ..B1.74:
        cmpl    $8388608, %ebx
        jb      ..B1.78
        jmp     ..B1.84
  ..B1.75:
        cmpl    $8388607, %ebx
        jb      ..B1.78
  ..B1.76:
        jne     ..B1.79
  ..B1.77:
        cmpl    $-2147483648, %eax
        jae     ..B1.79
  ..B1.78:
        movss   .L_2il0floatpacket.1(%rip), %xmm0
        testl   %esi, %esi              # mulss/movss leave EFLAGS alone
        mulss   %xmm0, %xmm0
        movss   %xmm0, (%rsp)
        je      ..B1.84
  ..B1.79:
        incl    %ebx
        cmpl    $2139095040, %ebx
        jae     ..B1.90
        jmp     ..B1.84
  ..B1.80:
  # Common tail for truncation and for round-to-nearest after its
  # decision: evaluate the 2^-100 * 2^-100 product when the packed
  # magnitude is below 0x00800000, or equals it while eax is exactly
  # 0x80000000.
        cmpl    $8388608, %ebx
        jb      ..B1.83
  ..B1.81:
        jne     ..B1.84
  ..B1.82:
        cmpl    $-2147483648, %eax
        jne     ..B1.84
  ..B1.83:
        movss   .L_2il0floatpacket.1(%rip), %xmm0
        mulss   %xmm0, %xmm0
        movss   %xmm0, (%rsp)
  ..B1.84:
  # Normal exit: result bits = sign | exponent/fraction, returned in xmm0.
        orl     %ebx, %esi
        movl    %esi, 4(%rsp)
        movss   4(%rsp), %xmm0
        addq    $40, %rsp
        .cfi_def_cfa_offset 40
        .cfi_restore 6
        popq    %rbp
        .cfi_def_cfa_offset 32
        .cfi_restore 3
        popq    %rbx
        .cfi_def_cfa_offset 24
        .cfi_restore 15
        popq    %r15
        .cfi_def_cfa_offset 16
        .cfi_restore 14
        popq    %r14
        .cfi_def_cfa_offset 8
        ret
  # Unwind state for the blocks below, which are entered with the full frame.
        .cfi_def_cfa_offset 80
        .cfi_offset 3, -32
        .cfi_offset 6, -40
        .cfi_offset 14, -16
        .cfi_offset 15, -24
  ..B1.85:
  # Exact cancellation: return _zeros[edi == 0x2000], i.e. -0.0f only when
  # the rounding bits equal 8192 and +0.0f otherwise.
        xorl    %edx, %edx
        cmpl    $8192, %edi
        lea     _zeros(%rip), %rcx
        sete    %dl
        movss   (%rcx,%rdx,4), %xmm0
        addq    $40, %rsp
        .cfi_def_cfa_offset 40
        .cfi_restore 6
        popq    %rbp
        .cfi_def_cfa_offset 32
        .cfi_restore 3
        popq    %rbx
        .cfi_def_cfa_offset 24
        .cfi_restore 15
        popq    %r15
        .cfi_def_cfa_offset 16
        .cfi_restore 14
        popq    %r14
        .cfi_def_cfa_offset 8
        ret
        .cfi_def_cfa_offset 80
        .cfi_offset 3, -32
        .cfi_offset 6, -40
        .cfi_offset 14, -16
        .cfi_offset 15, -24
  ..B1.86:
  # Operands rejected by the filter at the top: evaluate x*y then + z with
  # the scalar SSE instructions on the spilled arguments.
        movss   8(%rsp), %xmm0
        mulss   16(%rsp), %xmm0
        addss   24(%rsp), %xmm0
        movss   %xmm0, 4(%rsp)
        addq    $40, %rsp
        .cfi_def_cfa_offset 40
        .cfi_restore 6
        popq    %rbp
        .cfi_def_cfa_offset 32
        .cfi_restore 3
        popq    %rbx
        .cfi_def_cfa_offset 24
        .cfi_restore 15
        popq    %r15
        .cfi_def_cfa_offset 16
        .cfi_restore 14
        popq    %r14
        .cfi_def_cfa_offset 8
        ret
        .cfi_def_cfa_offset 80
        .cfi_offset 3, -32
        .cfi_offset 6, -40
        .cfi_offset 14, -16
        .cfi_offset 15, -24
  ..B1.87:
  # Round to nearest: no increment when the first discarded bit is 0, or
  # when it is 1 with all lower discarded bits 0 and the kept LSB 0
  # (tie with an even result).
        testl   $-2147483648, %eax
        je      ..B1.80
  ..B1.88:
        movl    %ebx, %ecx
        movl    %eax, %edx
        andl    $1, %ecx
        andl    $2147483647, %edx
        orl     %edx, %ecx
        je      ..B1.80
  ..B1.89:
        incl    %ebx
        cmpl    $2139095040, %ebx
        jb      ..B1.80
  ..B1.90:
  # Overflow: return _large_value_32[sign] * 2^100, computed with mulss so
  # the hardware produces the out-of-range result.
        lea     _large_value_32(%rip), %rax
        shrl    $31, %esi               # esi = 0 or 1, index by sign
        movss   (%rax,%rsi,4), %xmm0
        mulss   .L_2il0floatpacket.0(%rip), %xmm0
        movss   %xmm0, (%rsp)
        movss   %xmm0, 4(%rsp)
        addq    $40, %rsp
        .cfi_def_cfa_offset 40
        .cfi_restore 6
        popq    %rbp
        .cfi_def_cfa_offset 32
        .cfi_restore 3
        popq    %rbx
        .cfi_def_cfa_offset 24
        .cfi_restore 15
        popq    %r15
        .cfi_def_cfa_offset 16
        .cfi_restore 14
        popq    %r14
        .cfi_def_cfa_offset 8
        ret
        .align  16,0x90
        .cfi_endproc
        .type   fmaf,@function
        .size   fmaf,.-fmaf
        .data
  # -- End fmaf
  # Read-only constants referenced by fmaf, followed by the trailing
  # section directives of the translation unit. Values are IEEE-754 single
  # precision bit patterns; the two-entry tables are indexed by a 0/1
  # value computed in fmaf (sign bit, or a rounding-bits comparison).
        .section .rodata, "a"
        .align  4
        .align  4
  .L_2il0floatpacket.0:
        .long   0x71800000              # 2^100, multiplier on the overflow path
        .type   .L_2il0floatpacket.0,@object
        .size   .L_2il0floatpacket.0,4
        .align  4
  .L_2il0floatpacket.1:
        .long   0x0d800000              # 2^-100, squared on the tiny-result paths
        .type   .L_2il0floatpacket.1,@object
        .size   .L_2il0floatpacket.1,4
        .align  4
  .L_2il0floatpacket.2:
        .long   0x3f800000              # 1.0f (not referenced by the fmaf code in this file)
        .type   .L_2il0floatpacket.2,@object
        .size   .L_2il0floatpacket.2,4
        .align  4
  _ones:
        .long   1065353216              # 0x3F800000 = +1.0f  (index 0)
        .long   3212836864              # 0xBF800000 = -1.0f  (index 1)
        .type   _ones,@object
        .size   _ones,8
        .align  4
  _zeros:
        .long   0                       # +0.0f               (index 0)
        .long   2147483648              # 0x80000000 = -0.0f  (index 1)
        .type   _zeros,@object
        .size   _zeros,8
        .align  4
  _large_value_32:
        .long   1904214016              # 0x71800000 = +2^100 (index 0)
        .long   4051697664              # 0xF1800000 = -2^100 (index 1)
        .type   _large_value_32,@object
        .size   _large_value_32,8
        .data
        .section .note.GNU-stack, ""
  // -- Begin DWARF2 SEGMENT .eh_frame
        .section .eh_frame,"a",@progbits
  .eh_frame_seg:
        .align  1
  # End