#include "sgx_arch.h" #include "asm-offsets.h" # In some cases, like bogus parameters passed to enclave_entry, it's tricky to # return cleanly (passing the correct return address to EEXIT, OCALL_EXIT can # be interrupted, etc.). Since those cases should only ever happen with a # malicious urts, just go into an endless loop. .macro FAIL_LOOP .Lfail_loop\@: jmp .Lfail_loop\@ .endm .macro CHECK_IF_SIGNAL_STACK_IS_USED stack_reg, label_on_stack, label_out_of_stack cmpq %gs:SGX_SIG_STACK_LOW, \stack_reg jb \label_out_of_stack cmpq %gs:SGX_SIG_STACK_HIGH, \stack_reg ja \label_out_of_stack jmp \label_on_stack .endm .global enclave_entry .type enclave_entry, @function enclave_entry: # On EENTER, RAX is the current SSA index (aka CSSA), RBX is the address of # TCS, RCX is the address of IP following EENTER. Other regs are not trusted. # x86-64 sysv abi requires %rFLAGS.DF = 0 on entry to function call. cld cmpq $0, %rax jne .Lprepare_resume # ECALL return address in RCX (filled by EENTER hardware flow) movq %rcx, %gs:SGX_ECALL_RETURN_ADDR # The following code is hardened to defend attacks from untrusted host. # Any states given by the host instead of the ISA must be assumed # potentially malicious. # # For instance, Jo Van Bulck contributed a detailed vulnerability report # in https://github.com/oscarlab/graphene/issues/28. (Fixed) # Brief description of the vulnerabilities: # The previous implementation does not check the index of entry # functions (RDI at enclave entry) given by the untrusted PAL. # An attacker can cause overflow/underflow to jump to random # locaion in enclaves. Moreover, we used a specific index # (RETURN_FROM_OCALL) to tell if the control flow is returned # from a OCALL in the untrusted PAL. Attackers can manipulate RDI # to deceive the trusted PAL. # This thread can be interrupted but then the above check branches to # .Lprepare_resume. So the outside can't re-enter the checks below in # the middle. # Only jump to .Lreturn_from_ocall if we have prepared the stack for # it. cmpq $0, %gs:SGX_PRE_OCALL_STACK jne .Lreturn_from_ocall # PAL convention: # RDI - ECALL number # RSI - pointer to ecall arguments # RDX - exit target # RCX (former RSP) - The untrusted stack # R8 - enclave base cmpq $ECALL_THREAD_RESET, %rdi je .Lhandle_thread_reset # Except ecall_thread_reset, ecalls are only used to start a thread (main # or additional threads). We already checked for case of ecall_thread_reset, # so at this point we should only get exactly one ecall per thread cmpq $0, %gs:SGX_THREAD_STARTED je 1f FAIL_LOOP 1: movq $1, %gs:SGX_THREAD_STARTED # calculate enclave base = RBX (trusted) - %gs:SGX_TCS_OFFSET subq %gs:SGX_TCS_OFFSET, %rbx movq %rbx, %r8 # push untrusted stack address to RCX movq %rsp, %rcx # switch to enclave stack: enclave base + %gs:SGX_INITIAL_STACK_OFFSET addq %gs:SGX_INITIAL_STACK_OFFSET, %rbx movq %rbx, %rsp # clear the rest of register states xorq %rax, %rax xorq %rbx, %rbx xorq %r9, %r9 xorq %r10, %r10 xorq %r11, %r11 xorq %r12, %r12 xorq %r13, %r13 xorq %r14, %r14 xorq %r15, %r15 # clear the Alignment Check flag (%rFLAGS.AC) to prevent #AC-fault side channel; # this overrides 8B on enclave stack but stack is not used at this point anyway pushfq andq $(~RFLAGS_AC), (%rsp) popfq # Clear "extended" state (FPU aka x87, SSE, AVX, ...). # TODO: We currently clear only state covered by FXRSTOR but not by XRSTOR # (e.g., no clearing of YMM/ZMM regs). This is because we didn't read # the value of XFRM yet, so we don't know whether XRSTOR is safe at # this point. 
    leaq xsave_reset_state(%rip), %rax
    fxrstor (%rax)
    xorq %rax, %rax

    # register state needs to be carefully checked, so we move the handling
    # to handle_ecall() in enclave_ecalls.c
    callq handle_ecall

    # handle_ecall will only return when invalid parameters have been passed.
    FAIL_LOOP

    # clear TLS variables for thread reuse
.Lhandle_thread_reset:
    movq $0, %gs:SGX_READY_FOR_EXCEPTIONS

    # Assertion: thread is reset only after the special-case OCALL_EXIT.
    cmpq $0, %gs:SGX_OCALL_EXIT_CALLED
    jne 1f
    FAIL_LOOP
1:

    # At this point, the thread has completely exited from the point of view
    # of LibOS. We can now set *clear_child_tid to 0, which will trigger the
    # async helper thread in LibOS, which will wake up the parent thread if any.
    cmpq $0, %gs:SGX_CLEAR_CHILD_TID
    je 1f
    movq %gs:SGX_CLEAR_CHILD_TID, %rbx
    movl $0, (%rbx)
1:

    # Signals are impossible at this point: a benign untrusted runtime blocks
    # all signals (see sgx_ocall_exit()), and even if a malicious one doesn't
    # block them, signals are ignored due to SGX_READY_FOR_EXCEPTIONS = 0.
    movq $0, %gs:SGX_THREAD_STARTED
    movq $0, %gs:SGX_OCALL_EXIT_CALLED
    movq $0, %gs:SGX_PRE_OCALL_STACK

    # Instead of jumping to .Lclear_and_eexit, simply perform EEXIT because
    # there is no modified state to clear in this "thread-reset" code path.
    movq %gs:SGX_ECALL_RETURN_ADDR, %rbx
    movq $EEXIT, %rax
    ENCLU

.Lprepare_resume:
    # PAL convention:
    # RDI - external event

    # Nested exceptions at the host-OS level are disallowed:
    # - Synchronous exceptions are assumed to never happen during
    #   prepare_resume;
    # - Asynchronous signals are not nested by a benign host OS because
    #   we mask asynchronous signals in the signal handler.
    # If a malicious host OS injects a nested signal, CSSA != 1 and we go
    # into FAIL_LOOP. Currently this check is an assertion only because it
    # is also enforced by EENTER since the enclave is created with NSSA=2.
    cmpq $1, %rax
    je 1f
    FAIL_LOOP
1:

    movq %gs:SGX_GPR, %rbx

    movq %rdi, %rsi
    xorq %rdi, %rdi
    movl SGX_GPR_EXITINFO(%rbx), %edi
    testl $0x80000000, %edi
    jnz .Lhandle_exception

    movl %esi, %edi
    # use external event - only the first 8 bits count
    andl $0xff, %edi
    cmpl $0, %edi
    jne .Lhandle_exception

.Lignore_exception:
    # clear the registers
    xorq %rdi, %rdi
    xorq %rsi, %rsi

    # exit address in RDX, move it to RBX
    movq %rdx, %rbx
    jmp .Lclear_and_eexit

.Lhandle_exception:
    # If this enclave thread has not been initialized yet, we should not
    # try to call an event handler yet.
    cmpq $0, %gs:SGX_READY_FOR_EXCEPTIONS
    jne 1f
    FAIL_LOOP
1:

    # Beware of races between host signal delivery and handling of %rsp in
    # this entry code. Consider the following scenario:
    #
    # 1. We are inside the enclave but %rsp isn't restored yet to something
    #    inside the enclave. That's for example the case when returning from
    #    an ocall.
    # 2. The enclave gets interrupted. The not-yet-restored %rsp is pushed
    #    into SGX_GPR_RSP by the processor.
    # 3. The host enters the enclave again and indicates that there's a new
    #    signal.
    # 4. SGX_GPR_RSP points to the untrusted stack.
    #
    # The code below should be fine since it detects an interrupted ocall
    # and restores %rsp from SGX_PRE_OCALL_STACK before exception handling
    # (see below for full details).

    # The stack-swap logic does not need to be atomic because nested
    # exceptions are disallowed by SGX due to TCS.NSSA == 2 (thus, the
    # .Lhandle_exception logic cannot be nested).

    # Check if we got interrupted during an ocall (except OCALL_EXIT),
    # i.e. SGX_PRE_OCALL_STACK is set.
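    # In hedged pseudo-C, the three-way dispatch implemented below is roughly
    # (field names are illustrative stand-ins for the %gs-based TLS fields):
    #
    #     if (tls->pre_ocall_stack != 0)
    #         goto handle_interrupted_ocall;  /* interrupted in ocall path */
    #     else if (tls->ocall_exit_called)
    #         goto ignore_exception;          /* thread is exiting anyway  */
    #     else
    #         goto setup_exception_handler;   /* regular in-enclave event  */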
    movq %gs:SGX_PRE_OCALL_STACK, %rsi
    cmpq $0, %rsi
    jne .Lhandle_interrupted_ocall

    # If this is not the case, check if OCALL_EXIT has been called. If this
    # is not the case, set up the exception handler for the non-ocall case.
    cmpq $0, %gs:SGX_OCALL_EXIT_CALLED
    je .Lsetup_exception_handler

    # We are interrupted during the never-returning OCALL_EXIT. Because the
    # thread is going to exit anyway, we can ignore this exception.
    jmp .Lignore_exception

.Lhandle_interrupted_ocall:
    # At this point, we are in the exception handler and
    # SGX_PRE_OCALL_STACK = <trusted pointer to enclave stack>, i.e. we were
    # interrupted during handling of the enclave's sgx_ocall/return_from_ocall
    # assembly code.
    #
    # Triggering the exception handler while SGX_PRE_OCALL_STACK != 0 would
    # be problematic because it could itself issue nested ocalls. This
    # would mean the SGX_PRE_OCALL_STACK logic would need to handle
    # nesting.
    #
    # Instead, if we're in such a situation, we emulate it as if %rip had
    # reached the safe point, .Lreturn_from_ocall_after_stack_restore.
    #
    # Ocall sequence:
    #  1. call sgx_ocall()
    #  2. SGX_PRE_OCALL_STACK = %rsp: save trusted stack
    #  3. EEXIT
    #  4. untrusted PAL, which issues the real host system call
    #  5. EENTER (and start from enclave_entry)
    #  6. .Lreturn_from_ocall:
    #  7. (%rsp, SGX_PRE_OCALL_STACK) = (SGX_PRE_OCALL_STACK, 0): restore trusted stack
    #  8. .Lreturn_from_ocall_after_stack_restore:
    #  9. call _DkHandleExternalEvent() if interrupted
    # 10. return from sgx_ocall() to the caller
    #
    # It is also required that sgx_ocall() be atomic with respect to async
    # exceptions. When a host async signal arrives, sgx_ocall() should result
    # in EINTR.
    #
    # There are three possibilities for when exactly the host async signal
    # arrives:
    # A. before exiting the enclave to perform the host syscall
    # B. after exiting the enclave and before re-entering the enclave
    #    (i.e., during untrusted execution of the host syscall)
    # C. after re-entering the enclave but before returning to sgx_ocall().
    #
    # Note that Case A didn't even issue the host syscall, Case B may have
    # interrupted the host syscall (but maybe the interrupt came after a
    # successful host syscall), and Case C was interrupted after a successful
    # host syscall. In Case C, the result of the host system call must be
    # preserved to be replayed in a later invocation.
    #
    # On a host async signal we treat these cases as follows:
    # A. right-before EEXIT (2. in the above sequence; before 2. got executed
    #    we don't land here):
    #        - set EINTR and forward %rip to the exception handler
    # B. during untrusted PAL (3. - 4. in the above sequence):
    #        - code in _DkTerminateSighandler() must handle this case
    #    TODO: fix _DkTerminateSighandler() to not lose the result of a
    #          successful system call.
    # C. right-after EENTER (5. - 7. in the above sequence):
    #        - ocall succeeded, forward %rip to the exception handler

    # Find out which of cases A, B, or C happened:
    # - copy the rip at which the enclave was interrupted into %rax,
    # - copy the boundaries between cases A, B, and C into %r11,
    # - compare the enclave's rip against these boundaries (%rax vs %r11).
    movq SGX_GPR_RIP(%rbx), %rax
    leaq .Locall_about_to_eexit_begin(%rip), %r11
    cmpq %r11, %rax
    jb .Lhandle_interrupted_ocall_case_c
    leaq .Locall_about_to_eexit_end(%rip), %r11
    cmpq %r11, %rax
    jae .Lhandle_interrupted_ocall_case_c

    # Case A. We are right-before EEXIT for an ocall in between
    # [.Locall_about_to_eexit_begin, .Locall_about_to_eexit_end).
    # Skip EEXIT as if the ocall returned -EINTR.
    # If there is a registered signal handler for the current exception,
    # _DkHandleExternalEvent() will be called (and thus we need to save
    # %rdi = <external event>) before returning from the ocall.
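    # As hedged pseudo-C, the Case A/C fixup below is roughly (gpr mirrors
    # the SSA GPR area pointed to by %rbx; names are illustrative):
    #
    #     if (gpr->rip >= &ocall_about_to_eexit_begin &&
    #         gpr->rip <  &ocall_about_to_eexit_end)
    #         gpr->rdi = -EINTR;        /* Case A: pretend the ocall failed */
    #     /* Cases A and C both continue here: */
    #     gpr->rsi = external_event;    /* argument for .Lreturn_from_ocall */
    #     gpr->rip = &return_from_ocall_after_stack_restore;
    #     gpr->rsp = tls->pre_ocall_stack;
    #     tls->pre_ocall_stack = 0;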
    movq $-EINTR, SGX_GPR_RDI(%rbx)  # return value for .Lreturn_from_ocall

    # fallthrough to Case C.

    # This code cannot land in Case B because:
    # (1) this code path (.Lhandle_exception) is triggered only if we haven't
    #     yet exited the enclave when the signal arrived, and
    # (2) in Case B, we exited the enclave and the signal arrived while in
    #     untrusted code. The two conditions cannot be true at the same time,
    #     so Case B never happens here (Case B results in the return_from_ocall
    #     code path below).

.Lhandle_interrupted_ocall_case_c:
    # Case C. We are right-after EENTER returning from a successful ocall.
    # Move %rip to .Lreturn_from_ocall_after_stack_restore and let
    # _DkHandleExternalEvent() handle the exception.
    # SGX_GPR_RDI(%rbx): don't touch the successful ocall result.
    movq %rdi, SGX_GPR_RSI(%rbx)  # external event for .Lreturn_from_ocall
    leaq .Lreturn_from_ocall_after_stack_restore(%rip), %rax
    movq %rax, SGX_GPR_RIP(%rbx)
    movq %rsi, SGX_GPR_RSP(%rbx)
    movq $0, %gs:SGX_PRE_OCALL_STACK
    andq $(~(RFLAGS_DF | RFLAGS_AC)), SGX_GPR_RFLAGS(%rbx)
    jmp .Leexit_exception

.Lsetup_exception_handler:
    # The thread got interrupted outside of ocall handling (see above for
    # that special case). We inject a call to _DkExceptionHandler into the
    # interrupted thread, which will handle the exception on ERESUME.

    # The last instructions of _restore_sgx_context need to be atomic for
    # the code below (see _restore_sgx_context for more details). So
    # emulate this if we were interrupted there.
    leaq .Ltmp_rip_saved0(%rip), %rax
    cmpq %rax, SGX_GPR_RIP(%rbx)
    je .Lemulate_tmp_rip_saved0

    leaq .Ltmp_rip_saved1(%rip), %rax
    cmpq %rax, SGX_GPR_RIP(%rbx)
    je .Lemulate_tmp_rip_saved1

    leaq .Ltmp_rip_saved2(%rip), %rax
    cmpq %rax, SGX_GPR_RIP(%rbx)
    je .Lemulate_tmp_rip_saved2

    jmp .Lemulate_tmp_rip_end

.Lemulate_tmp_rip_saved0:
    # emulate movq SGX_CPU_CONTEXT_R15 - SGX_CPU_CONTEXT_RIP(%rsp), %r15
    movq SGX_GPR_RSP(%rbx), %rax
    movq SGX_CPU_CONTEXT_R15 - SGX_CPU_CONTEXT_RIP(%rax), %rax
    movq %rax, SGX_GPR_R15(%rbx)
.Lemulate_tmp_rip_saved1:
    # emulate movq SGX_CPU_CONTEXT_RSP - SGX_CPU_CONTEXT_RIP(%rsp), %rsp
    movq SGX_GPR_RSP(%rbx), %rax
    movq SGX_CPU_CONTEXT_RSP - SGX_CPU_CONTEXT_RIP(%rax), %rax
    movq %rax, SGX_GPR_RSP(%rbx)
.Lemulate_tmp_rip_saved2:
    # emulate jmp *%gs:SGX_TMP_RIP
    movq %gs:SGX_TMP_RIP, %rax
    movq %rax, SGX_GPR_RIP(%rbx)
.Lemulate_tmp_rip_end:

    movq SGX_GPR_RSP(%rbx), %rsi
    CHECK_IF_SIGNAL_STACK_IS_USED %rsi, .Lon_signal_stack, .Lout_of_signal_stack

.Lout_of_signal_stack:
    movq %gs:SGX_SIG_STACK_HIGH, %rsi

    # When switching to the not-yet-used signal stack we don't need to reserve
    # a redzone. So move the stack pointer up here to undo the move down below.
    addq $RED_ZONE_SIZE, %rsi

    # Set up the stack for the signal handler, _DkExceptionHandler().
    # _restore_sgx_context() must be used to return back to the
    # original context.
    # Stack layout:
    #     8-byte padding: (8 mod 16) bytes aligned for the x86 ABI
    #                     NOTE: there is no saved rip to return to.
    #     sgx_cpu_context_t: 144 bytes
    #     xsave area: aligned to PAL_XSTATE_ALIGN=64 bytes
    #     padding if necessary
    #     RED_ZONE unless newly switching to the signal stack

#define STACK_PADDING_SIZE (PAL_FP_XSTATE_MAGIC2_SIZE + 8)
#define STACK_FRAME_SUB \
    (SGX_CPU_CONTEXT_SIZE + RED_ZONE_SIZE + STACK_PADDING_SIZE)
.Lon_signal_stack:
    movl xsave_size(%rip), %eax
    addq $STACK_FRAME_SUB, %rax
    subq %rax, %rsi

    # Align the xsave area to 64 bytes after sgx_cpu_context_t
    andq $~(PAL_XSTATE_ALIGN - 1), %rsi
    subq $SGX_CPU_CONTEXT_XSTATE_ALIGN_SUB, %rsi

    # we have exitinfo in RDI, swap it with the one in the GPR area
    # and dump it into the context
    xchgq %rdi, SGX_GPR_RDI(%rbx)  # 1st argument for _DkExceptionHandler()
    movq %rdi, SGX_CPU_CONTEXT_RDI(%rsi)

    # dump the rest of the context
    movq SGX_GPR_RAX(%rbx), %rdi
    movq %rdi, SGX_CPU_CONTEXT_RAX(%rsi)
    movq SGX_GPR_RCX(%rbx), %rdi
    movq %rdi, SGX_CPU_CONTEXT_RCX(%rsi)
    movq SGX_GPR_RDX(%rbx), %rdi
    movq %rdi, SGX_CPU_CONTEXT_RDX(%rsi)
    movq SGX_GPR_RBX(%rbx), %rdi
    movq %rdi, SGX_CPU_CONTEXT_RBX(%rsi)
    movq SGX_GPR_RSP(%rbx), %rdi
    movq %rdi, SGX_CPU_CONTEXT_RSP(%rsi)
    movq SGX_GPR_RBP(%rbx), %rdi
    movq %rdi, SGX_CPU_CONTEXT_RBP(%rsi)
    movq SGX_GPR_RSI(%rbx), %rdi
    movq %rdi, SGX_CPU_CONTEXT_RSI(%rsi)
    /* rdi is saved above */
    movq SGX_GPR_R8(%rbx), %rdi
    movq %rdi, SGX_CPU_CONTEXT_R8(%rsi)
    movq SGX_GPR_R9(%rbx), %rdi
    movq %rdi, SGX_CPU_CONTEXT_R9(%rsi)
    movq SGX_GPR_R10(%rbx), %rdi
    movq %rdi, SGX_CPU_CONTEXT_R10(%rsi)
    movq SGX_GPR_R11(%rbx), %rdi
    movq %rdi, SGX_CPU_CONTEXT_R11(%rsi)
    movq SGX_GPR_R12(%rbx), %rdi
    movq %rdi, SGX_CPU_CONTEXT_R12(%rsi)
    movq SGX_GPR_R13(%rbx), %rdi
    movq %rdi, SGX_CPU_CONTEXT_R13(%rsi)
    movq SGX_GPR_R14(%rbx), %rdi
    movq %rdi, SGX_CPU_CONTEXT_R14(%rsi)
    movq SGX_GPR_R15(%rbx), %rdi
    movq %rdi, SGX_CPU_CONTEXT_R15(%rsi)
    movq SGX_GPR_RFLAGS(%rbx), %rdi
    movq %rdi, SGX_CPU_CONTEXT_RFLAGS(%rsi)
    movq SGX_GPR_RIP(%rbx), %rdi
    movq %rdi, SGX_CPU_CONTEXT_RIP(%rsi)

    # Pass pointers to sgx_cpu_context_t and PAL_XREGS_STATE to _DkExceptionHandler
    movq %rsi, SGX_GPR_RSI(%rbx)  # 2nd argument for _DkExceptionHandler()
    movq %rsi, SGX_GPR_RDX(%rbx)
    addq $SGX_CPU_CONTEXT_SIZE, SGX_GPR_RDX(%rbx)  # 3rd argument for _DkExceptionHandler()
    # TODO: save EXINFO in the MISC region

    # The x86-64 SysV ABI requires 16B alignment of the stack before a call
    # instruction, which implies a (8 mod 16)B alignment on function entry
    # (due to the implicit push %rip). Since we already aligned the xsave
    # area above, this requirement is satisfied.
    subq $8, %rsi
    movq %rsi, SGX_GPR_RSP(%rbx)

    # clear RFLAGS.DF to conform to the SysV ABI, clear RFLAGS.AC to prevent
    # the #AC-fault side channel
    andq $(~(RFLAGS_DF | RFLAGS_AC)), SGX_GPR_RFLAGS(%rbx)

    # new RIP is the exception handler
    leaq _DkExceptionHandler(%rip), %rdi
    movq %rdi, SGX_GPR_RIP(%rbx)

    movq %rdx, %rbx
    leaq SGX_CPU_CONTEXT_SIZE + 8(%rsi), %rdi
    leaq 1f(%rip), %r11
    jmp __save_xregs
1:
    movq %rbx, %rdx

.Leexit_exception:
    # clear the registers
    xorq %rdi, %rdi
    xorq %rsi, %rsi

    # exit address in RDX, move it to RBX
    movq %rdx, %rbx
    jmp .Lclear_and_eexit

    .global sgx_ocall
    .type sgx_ocall, @function

sgx_ocall:
    # arguments:
    # RDI: OCALL number (code)
    # RSI: OCALL args on untrusted stack (ms)
    #
    # sgx_cpu_context_t:
    #     RAX = 0: placeholder
    #     RCX
    #     ...
    #     RFLAGS
    #     RIP
    # xsave area
    #     xregs
    # (padding)
    # --- the stack may be non-contiguous, as we may switch to the signal stack
    # previous RBP
    # previous RIP: pushed by callq
    .cfi_startproc
    pushq %rbp
    .cfi_adjust_cfa_offset 8
    movq %rsp, %rbp
    .cfi_offset %rbp, -16
    .cfi_def_cfa_register %rbp

    CHECK_IF_SIGNAL_STACK_IS_USED %rsp, .Lon_signal_stack_ocall, .Lout_of_signal_stack_ocall

.Lout_of_signal_stack_ocall:
    movq %gs:SGX_SIG_STACK_HIGH, %rsp

.Lon_signal_stack_ocall:
    movl xsave_size(%rip), %eax
    addq $STACK_PADDING_SIZE, %rax
    subq %rax, %rsp
    andq $~(PAL_XSTATE_ALIGN - 1), %rsp

    pushq %rdx
    pushq %rdi
    movq %rsp, %rdi
    addq $2 * 8, %rdi  # skip the pushq %rdx; pushq %rdi above
    callq save_xregs
    popq %rdi
    popq %rdx

    movq 8(%rbp), %rax
    pushq %rax  # previous RIP
    pushfq

    # Under GDB, single-stepping sets the Trap Flag (TF) of EFLAGS,
    # thus TF=1 is stored by the pushfq above. Upon the subsequent popfq,
    # TF is 1, resulting in a spurious trap. Reset TF here.
    andq $~0x100, (%rsp)

    pushq %r15
    pushq %r14
    pushq %r13
    pushq %r12
    pushq %r11
    pushq %r10
    pushq %r9
    pushq %r8
    pushq %rdi
    pushq %rsi
    movq (%rbp), %rax
    pushq %rax  # previous RBP
    leaq 16(%rbp), %rax
    pushq %rax  # previous RSP
    pushq %rbx
    pushq %rdx
    pushq %rcx
    pushq $0  # placeholder for RAX

    # OCALL_EXIT should never return (see sgx_ocall_exit(): it always exits
    # the thread). Skip setting SGX_PRE_OCALL_STACK to land in the special
    # case of ECALL_THREAD_RESET (issued in sgx_ocall_exit()) later. Note
    # that if there is an interrupt (which usually would result in a
    # simulated return of -EINTR), it will be silently ignored via
    # .Lignore_exception.
    cmpq $OCALL_EXIT, %rdi
    jne 1f
    movq $1, %gs:SGX_OCALL_EXIT_CALLED
    jmp .Locall_about_to_eexit_begin
1:

    movq %rsp, %gs:SGX_PRE_OCALL_STACK

.Locall_about_to_eexit_begin:
    # From here, .Lhandle_exception can mess with our state (%rip and %rsp).
    # We therefore need to be extremely careful when making changes here.
    #
    # It's ok to use the untrusted stack and exit target below without
    # checks since the processor will ensure that after exiting enclave
    # mode, in-enclave memory can't be accessed.

    movq %gs:SGX_USTACK, %rsp

#ifdef DEBUG
    # Push the %rip of some code inside __morestack() onto the untrusted
    # stack. At sgx_entry(), GDB deduces saved_rip by looking at CFA-8 = %rsp.
    leaq .Lfor_cfa_debug_info(%rip), %r8
    pushq %r8
#endif

    movq %gs:SGX_EXIT_TARGET, %rbx
    .cfi_endproc
    # fallthrough

# Clear other registers and similar state and then call EEXIT
#
# Arguments for EEXIT/untrusted code (not cleared):
#
#     %rbx: exit target
#     %rsp: untrusted stack
#     %rdi, %rsi: (optional) arguments to untrusted code.
.Lclear_and_eexit:
#ifdef DEBUG
    # Enclave and untrusted stacks are split (segmented). GDB refuses to
    # unwind such stacks because it looks like stack frames "jump" back
    # and forth. Luckily, GDB special-cases stack frames for a function
    # with the hardcoded name "__morestack". Declare this dummy function
    # to make GDB happy.
    .global __morestack
    .type __morestack, @function
__morestack:
#endif
    .cfi_startproc

    # Clear "extended" state (FPU aka x87, SSE, AVX, ...).
    # pal_sec.enclave_attributes.xfrm will always be zero before
    # init_enclave has been called by pal_linux_main. So during early init
    # nothing should use features not covered by fxrstor, like AVX.
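    # __restore_xregs (defined at the end of this file) uses a stackless
    # calling convention: the argument is in %rdi and the return address is
    # passed in %r11 instead of on the stack. The call pattern, as used
    # right below, is:
    #
    #     leaq some_xsave_area(%rip), %rdi  # argument: pointer to xsave area
    #     leaq 1f(%rip), %r11               # return address, no stack touched
    #     jmp __restore_xregs
    # 1:                                    # execution continues here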
    movq %rdi, %r10
    leaq xsave_reset_state(%rip), %rdi
    leaq 1f(%rip), %r11
    jmp __restore_xregs
1:
    movq %r10, %rdi

2:
    # %rax is the argument to EEXIT
    # %rbx is the argument to EEXIT
    # %rcx is set to the AEP by EEXIT
    xorq %rdx, %rdx

    # %rsi, %rdi are arguments to the untrusted code
#ifdef DEBUG
.Lfor_cfa_debug_info:
    # Leave %rbp pointing to the OCALL function on the trusted stack.
#else
    # In non-debug mode, clear %rbp to not leak the trusted stack address.
    xorq %rbp, %rbp
#endif

    # %rsp points to the untrusted stack
    xorq %r8, %r8
    xorq %r9, %r9
    xorq %r10, %r10
    xorq %r11, %r11
    xorq %r12, %r12
    xorq %r13, %r13
    xorq %r14, %r14
    subq %r15, %r15  # use sub to set flags to a fixed value

    movq $EEXIT, %rax
    ENCLU
.Locall_about_to_eexit_end:

    ud2  # We should never get here.
    .cfi_endproc

.Lreturn_from_ocall:
    # PAL convention:
    # RDI - return value
    # RSI - external event (if there is any)

    # restore the stack
    movq %gs:SGX_PRE_OCALL_STACK, %rsp
    movq $0, %gs:SGX_PRE_OCALL_STACK

.Lreturn_from_ocall_after_stack_restore:
    # sgx_cpu_context_t::rax = %rdi
    movq %rdi, SGX_CPU_CONTEXT_RAX(%rsp)  # return value

    # restore FSBASE if necessary
    movq %gs:SGX_FSBASE, %rbx
    cmpq $0, %rbx
    je .Lno_fsbase
    .byte 0xf3, 0x48, 0x0f, 0xae, 0xd3  /* WRFSBASE %RBX */
.Lno_fsbase:

    # Check if there was a signal
    cmpq $0, %rsi
    jne .Lexternal_event

    movq %rsp, %rdi  # %rdi = sgx_cpu_context_t* uc
    movq %rsp, %rsi
    addq $SGX_CPU_CONTEXT_SIZE, %rsi  # %rsi = PAL_XREGS_STATE* xregs_state
    # _restore_sgx_context restores rflags and the fp registers, so we don't
    # have to sanitize them like below.
    jmp _restore_sgx_context
    # NOTREACHED

.Lexternal_event:
    # clear the Alignment Check flag (%rFLAGS.AC) to prevent #AC-fault side channel
    pushfq
    andq $(~RFLAGS_AC), (%rsp)
    popfq

    leaq xsave_reset_state(%rip), %rdi
    callq restore_xregs

    movq %rsi, %rdi  # 1st argument = PAL_NUM event
    movq %rsp, %rsi  # 2nd argument = sgx_cpu_context_t* uc
    leaq SGX_CPU_CONTEXT_SIZE(%rsp), %rdx  # 3rd argument = PAL_XREGS_STATE* xregs_state
    callq _DkHandleExternalEvent
    # NOTREACHED

# noreturn void _restore_sgx_context(sgx_cpu_context_t* uc, PAL_XREGS_STATE* xsave_area);
#
# Restore an sgx_cpu_context_t as generated by .Lhandle_exception. Execution
# will continue as specified by the rip in the context.
# If RDI (uc) points into the signal stack, we need to ensure that RSP points
# there until the last read from the context; otherwise .Lsetup_exception_handler
# might mess with it because it would think that the signal stack is not in
# use. In this case we assume that RSP points into the signal stack when we
# get called.
# (Also keep the redzone in mind; see the asserts for sgx_cpu_context_t in
# sgx_arch.h.)
    .global _restore_sgx_context
    .type _restore_sgx_context, @function

_restore_sgx_context:
    .cfi_startproc
    xchgq %rdi, %rsi
    callq restore_xregs

    movq %rsi, %r15

    movq SGX_CPU_CONTEXT_RAX(%r15), %rax
    movq SGX_CPU_CONTEXT_RCX(%r15), %rcx
    movq SGX_CPU_CONTEXT_RDX(%r15), %rdx
    movq SGX_CPU_CONTEXT_RBX(%r15), %rbx
    # For %rsp see below.
    movq SGX_CPU_CONTEXT_RBP(%r15), %rbp
    movq SGX_CPU_CONTEXT_RSI(%r15), %rsi
    movq SGX_CPU_CONTEXT_RDI(%r15), %rdi
    movq SGX_CPU_CONTEXT_R8(%r15), %r8
    movq SGX_CPU_CONTEXT_R9(%r15), %r9
    movq SGX_CPU_CONTEXT_R10(%r15), %r10
    movq SGX_CPU_CONTEXT_R11(%r15), %r11
    movq SGX_CPU_CONTEXT_R12(%r15), %r12
    movq SGX_CPU_CONTEXT_R13(%r15), %r13
    movq SGX_CPU_CONTEXT_R14(%r15), %r14
    # R15 will be restored below

    leaq SGX_CPU_CONTEXT_RFLAGS(%r15), %rsp
    popfq

    # See the comment at .Lsetup_exception_handler.
    #
    # The use of SGX_TMP_RIP (the enclave_tls::tmp_rip per-enclave-thread
    # field) must be atomic.
    # Consider a data race:
    # (1) thread handles a previous exception in SSA=0,
    # (2) thread is done and returns from the exception handler via
    #     _restore_sgx_context(),
    # (3) in the middle of _restore_sgx_context() a new exception arrives,
    # (4) the exception handler for this new exception is prepared in SSA=1,
    # (5) thread returns back to SSA=0 and handles this new exception,
    # (6) thread is done and returns from the exception handler via
    #     _restore_sgx_context() and updates SGX_TMP_RIP (overwrites
    #     enclave_tls::tmp_rip). Now the thread returned in the middle of
    #     _restore_sgx_context() and will try to jmp *%gs:SGX_TMP_RIP, but
    #     this value is lost, and a SIGILL/SEGFAULT follows.
    #
    # The last 4 instructions that restore RIP, RSP and R15 (needed
    # as a tmp reg) need to be atomic from the point of view of
    # .Lsetup_exception_handler.
    #
    # The reason is that .Lsetup_exception_handler can interrupt us in the
    # middle, and the nested exception handler that it injects would mess
    # with %gs:SGX_TMP_RIP when it calls us to return (%gs:SGX_TMP_RIP is a
    # single memory location per thread, so it is not re-entry safe).
    #
    # Since they are not atomic, .Lsetup_exception_handler will emulate this
    # behavior if it gets called while we are executing them (see there).

    # RSP currently points to RIP, so we need relative addressing to restore
    # RIP, R15, and RSP
    movq SGX_CPU_CONTEXT_RIP - SGX_CPU_CONTEXT_RIP(%rsp), %r15
    movq %r15, %gs:SGX_TMP_RIP
.Ltmp_rip_saved0:
    movq SGX_CPU_CONTEXT_R15 - SGX_CPU_CONTEXT_RIP(%rsp), %r15
.Ltmp_rip_saved1:
    movq SGX_CPU_CONTEXT_RSP - SGX_CPU_CONTEXT_RIP(%rsp), %rsp
.Ltmp_rip_saved2:
    jmp *%gs:SGX_TMP_RIP
    .cfi_endproc

# void __save_xregs(PAL_XREGS_STATE* xsave_area)
#   RDI: argument: pointer to xsave_area
#   R11: return address (passed in a register in order to not touch the
#        stack; in some situations the stack isn't available)
#   RAX, RDX: clobbered
    .global __save_xregs
    .type __save_xregs, @function

__save_xregs:
    .cfi_startproc
    movl xsave_enabled(%rip), %eax
    cmpl $0, %eax
    jz 1f

    # clear the xsave header
    movq $0, XSAVE_HEADER_OFFSET + 0 * 8(%rdi)
    movq $0, XSAVE_HEADER_OFFSET + 1 * 8(%rdi)
    movq $0, XSAVE_HEADER_OFFSET + 2 * 8(%rdi)
    movq $0, XSAVE_HEADER_OFFSET + 3 * 8(%rdi)
    movq $0, XSAVE_HEADER_OFFSET + 4 * 8(%rdi)
    movq $0, XSAVE_HEADER_OFFSET + 5 * 8(%rdi)
    movq $0, XSAVE_HEADER_OFFSET + 6 * 8(%rdi)
    movq $0, XSAVE_HEADER_OFFSET + 7 * 8(%rdi)

    movl $0xffffffff, %eax
    movl $0xffffffff, %edx
    xsave64 (%rdi)
    jmp *%r11
1:
    fxsave64 (%rdi)
    jmp *%r11
    .cfi_endproc

# void save_xregs(PAL_XREGS_STATE* xsave_area)
    .global save_xregs
    .type save_xregs, @function

save_xregs:
    .cfi_startproc
    popq %r11
    jmp __save_xregs
    .cfi_endproc

# void __restore_xregs(const PAL_XREGS_STATE* xsave_area)
#   RDI: argument: pointer to xsave_area
#   R11: return address (passed in a register in order to not touch the
#        stack; in some situations the stack isn't available)
#   RAX, RDX: clobbered
    .global __restore_xregs
    .type __restore_xregs, @function

__restore_xregs:
    .cfi_startproc
    movl xsave_enabled(%rip), %eax
    cmpl $0, %eax
    jz 1f

    movl $0xffffffff, %eax
    movl $0xffffffff, %edx
    xrstor64 (%rdi)
    jmp *%r11
1:
    fxrstor64 (%rdi)
    jmp *%r11
    .cfi_endproc

# void restore_xregs(const PAL_XREGS_STATE* xsave_area)
    .global restore_xregs
    .type restore_xregs, @function

restore_xregs:
    .cfi_startproc
    popq %r11
    jmp __restore_xregs
    .cfi_endproc
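# Example usage of save_xregs()/restore_xregs() from C (a sketch, not taken
# from the PAL sources; it assumes a buffer of at least xsave_size bytes
# aligned to PAL_XSTATE_ALIGN = 64, as required by xsave64/fxsave64):
#
#     void save_xregs(PAL_XREGS_STATE* xsave_area);
#     void restore_xregs(const PAL_XREGS_STATE* xsave_area);
#
#     __attribute__((aligned(64)))
#     static char xregs_buf[4096];  /* must be >= xsave_size */
#
#     save_xregs((PAL_XREGS_STATE*)xregs_buf);     /* snapshot x87/SSE/AVX... */
#     /* ... code that may clobber extended state ... */
#     restore_xregs((PAL_XREGS_STATE*)xregs_buf);  /* bring it back */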