xref: /php-src/ext/opcache/jit/ir/dynasm/dasm_x86.lua (revision 2ab1c3d5)
1------------------------------------------------------------------------------
2-- DynASM x86/x64 module.
3--
4-- Copyright (C) 2005-2023 Mike Pall. All rights reserved.
5-- See dynasm.lua for full copyright notice.
6------------------------------------------------------------------------------
7
-- Capture the global `x64` flag in a local. It selects between the x64 and
-- the x86 flavor of this module (see the `arch` field below).
-- NOTE(review): presumably set by the including wrapper module -- confirm.
local x64 = x64

-- Module information:
local _info = {
  arch =	x64 and "x64" or "x86",
  description =	"DynASM x86/x64 module",
  version =	"1.5.0",
  vernum =	 10500,
  release =	"2021-05-02",
  author =	"Mike Pall",
  license =	"MIT",
}

-- Exported glue functions for the arch-specific module.
-- The table starts out holding only the module information.
local _M = { _info = _info }
23
24-- Cache library functions.
25local type, tonumber, pairs, ipairs = type, tonumber, pairs, ipairs
26local assert, unpack, setmetatable = assert, unpack or table.unpack, setmetatable
27local _s = string
28local sub, format, byte, char = _s.sub, _s.format, _s.byte, _s.char
29local find, match, gmatch, gsub = _s.find, _s.match, _s.gmatch, _s.gsub
30local concat, sort, remove = table.concat, table.sort, table.remove
31local bit = bit or require("bit")
32local band, bxor, shl, shr = bit.band, bit.bxor, bit.lshift, bit.rshift
33
-- Inherited tables and callbacks.
-- They are only declared here; nothing in this part of the file assigns them.
-- NOTE(review): presumably filled in by the module setup code -- confirm.
local g_opt, g_arch
local wline, werror, wfatal, wwarn
37
-- Action name list.
-- CHECK: Keep this in sync with the C code!
-- The position in this list determines the action number: numbers are
-- assigned consecutively so that the last entry ("STOP") becomes 255.
local action_names = {
  -- int arg, 1 buffer pos:
  "DISP",  "IMM_S", "IMM_B", "IMM_W", "IMM_D",  "IMM_WB", "IMM_DB",
  -- action arg (1 byte), int arg, 1 buffer pos (reg/num):
  "VREG", "SPACE",
  -- ptrdiff_t arg, 1 buffer pos (address): !x64
  "SETLABEL", "REL_A",
  -- action arg (1 byte) or int arg, 2 buffer pos (link, offset):
  "REL_LG", "REL_PC",
  -- action arg (1 byte) or int arg, 1 buffer pos (link):
  "IMM_LG", "IMM_PC",
  -- action arg (1 byte) or int arg, 1 buffer pos (offset):
  "LABEL_LG", "LABEL_PC",
  -- action arg (1 byte), 1 buffer pos (offset):
  "ALIGN",
  -- action args (2 bytes), no buffer pos.
  "EXTERN",
  -- action arg (1 byte), no buffer pos.
  "ESC",
  -- no action arg, no buffer pos.
  "MARK",
  -- action arg (1 byte), no buffer pos, terminal action:
  "SECTION",
  -- no args, no buffer pos, terminal action:
  "STOP"
}
66
-- Maximum number of section buffer positions for dasm_put().
-- CHECK: Keep this in sync with the C code!
local maxsecpos = 25 -- Keep this low, to avoid excessively long C lines.

-- Action name -> action number (dynamically generated below).
local map_action = {}
-- First action number. Everything below does not need to be escaped.
-- Chosen so that the action numbers end at 255.
local actfirst = 256-#action_names

-- Action list buffer and string (only used to remove dupes).
-- actlist holds one byte value per entry; actstr is the string form of the
-- chunks kept so far and is what dedupechunk() searches.
local actlist = {}
local actstr = ""

-- Argument list for next dasm_put(). Start with offset 0 into action list.
local actargs = { 0 }

-- Current number of section buffer positions for dasm_put().
-- Starts at 1 because the action list offset occupies a position, too.
local secpos = 1
85
-- VREG kind encodings, pre-shifted by 5 bits.
-- wvreg() adds the low-order bits (psz and the shrink count times 8) on top
-- of these values before storing the byte in the action list.
-- Note that "modrm.rm.r", "opcode" and "sib.base" share the same encoding.
local map_vreg = {
  ["modrm.rm.m"] = 0x00,
  ["modrm.rm.r"] = 0x20,
  ["opcode"] =     0x20,
  ["sib.base"] =   0x20,
  ["sib.index"] =  0x40,
  ["modrm.reg"] =  0x80,
  ["vex.v"] =      0xa0,
  ["imm.hi"] =     0xc0,
}

-- Current number of VREG actions contributing to REX/VEX shrinkage.
-- Accumulated across deferred wvreg() calls and reset by the next
-- non-deferred one.
local vreg_shrink_count = 0
100
101------------------------------------------------------------------------------
102
-- Assign consecutive action numbers, starting at actfirst, in list order.
for idx = 1, #action_names do
  map_action[action_names[idx]] = actfirst + idx - 1
end
108
-- Print every action name with its number (hex and decimal) to `out`.
local function dumpactions(out)
  out:write("DynASM encoding engine action codes:\n")
  for idx = 1, #action_names do
    local aname = action_names[idx]
    local code = map_action[aname]
    out:write(format("  %-10s %02X  %d\n", aname, code, code))
  end
  out:write("\n")
end
118
-- Write action list buffer as a huge static C array.
--   out   output object with a :write() method.
--   name  C identifier used for the array.
-- An empty action list is still emitted as a one-element array holding 255.
-- The shared action list is only read here, never modified, so calling this
-- function more than once yields the same output every time.
local function writeactions(out, name)
  local nn = #actlist
  out:write("static const unsigned char ", name, "[", nn == 0 and 1 or nn,
	    "] = {\n")
  local s = "  "
  -- All bytes except the last one are followed by a comma.
  for i = 1, nn-1 do
    s = s..actlist[i]..","
    if #s >= 75 then -- Wrap long C lines.
      assert(out:write(s, "\n"))
      s = "  "
    end
  end
  out:write(s, actlist[nn] or 255, "\n};\n\n") -- Last byte, without comma.
end
136
137------------------------------------------------------------------------------
138
-- Append one raw (unescaped) byte to the action list.
-- The value must be an integer in the range 0..255.
local function wputxb(n)
  local valid = n >= 0 and n <= 255 and n % 1 == 0
  assert(valid, "byte out of range")
  actlist[#actlist+1] = n
end
144
-- Emit the named action into the action list.
-- An optional argument `a` is queued for the next dasm_put() call. The buffer
-- position count advances by `num` (default 1) if either `a` or `num` is given.
local function waction(action, a, num)
  local code = assert(map_action[action], "bad action name `"..action.."'")
  wputxb(code)
  if a then
    actargs[#actargs+1] = a
    secpos = secpos + (num or 1)
  elseif num then
    secpos = secpos + num
  end
end
151
-- Optionally add a VREG action.
--   kind   key into map_vreg, selects the field that receives the register.
--   vreg   argument for the VREG action; nothing is emitted if it is
--          nil/false.
--   psz    value added to the low bits of the action byte (default 0).
--   sk     threshold: kinds encoded below it bump the shrink count.
--   defer  if truthy, the accumulated shrink count is kept for a later
--          call instead of being folded into this action byte.
local function wvreg(kind, vreg, psz, sk, defer)
  if not vreg then return end
  waction("VREG", vreg)
  -- Report the offending kind (not the register expression) on a bad lookup.
  local b = assert(map_vreg[kind], "bad vreg kind `"..tostring(kind).."'")
  if b < (sk or 0) then
    vreg_shrink_count = vreg_shrink_count + 1
  end
  if not defer then
    b = b + vreg_shrink_count * 8
    vreg_shrink_count = 0
  end
  wputxb(b + (psz or 0))
end
166
-- Emit a line calling the embedded DynASM C function dasm_<func>(Dst, ...).
local function wcall(func, args)
  local arglist = concat(args, ", ")
  local line = format("dasm_%s(Dst, %s);", func, arglist)
  wline(line, true)
end
171
-- Drop the action list chunk after `offset` if an identical byte sequence is
-- already present in the action string. Otherwise remember the new chunk.
-- A tad slow, but so what.
local function dedupechunk(offset)
  local first, last = offset+1, #actlist
  local chunk = char(unpack(actlist, first, last))
  local pos = find(actstr, chunk, 1, true)
  if not pos then
    actstr = actstr..chunk
    return
  end
  actargs[1] = pos-1 -- Point the pending dasm_put() at the earlier copy.
  for i = first, last do actlist[i] = nil end -- Remove the duplicate bytes.
end
184
-- Flush pending actions into a dasm_put() call (needed before intervening
-- C code or on buffer position overflow). Pass a truthy `term` if the list
-- already ends with a terminal action, so no STOP is appended.
local function wflush(term)
  local start = actargs[1]
  if #actlist ~= start then -- Only if something was added since last flush.
    if not term then waction("STOP") end
    dedupechunk(start)
    wcall("put", actargs)
    -- The next dasm_put() starts at the current end of the action list.
    actargs = { #actlist }
    -- The action list offset occupies a buffer position, too.
    secpos = 1
  end
end
195
-- Append a code byte to the action list. Bytes that collide with action
-- numbers (>= actfirst) are preceded by an ESC action.
local function wputb(n)
  local needs_escape = n >= actfirst
  if needs_escape then
    waction("ESC")
  end
  wputxb(n)
end
201
202------------------------------------------------------------------------------
203
-- Global label name -> global label number. Unknown names are validated and
-- get the next free number on first lookup (numbering starts at 10).
local next_global = 10
local map_global = setmetatable({}, {
  __index = function(tab, label)
    if not match(label, "^[%a_][%w_@]*$") then
      werror("bad global label")
    end
    local num = next_global
    if num > 246 then
      werror("too many global labels")
    end
    next_global = num + 1
    tab[label] = num
    return num
  end,
})
214
-- List all global label names, ordered by label number, on `out`.
local function dumpglobals(out, lvl)
  local names = {}
  for label, num in pairs(map_global) do
    names[num] = label
  end
  out:write("Global labels:\n")
  for num = 10, next_global-1 do
    local line = format("  %s\n", names[num])
    out:write(line)
  end
  out:write("\n")
end
225
-- Emit a C enum with one prefixed entry per global label (in label number
-- order), followed by a <prefix>_MAX entry. Anything from the first "@" on
-- is cut off the label name.
local function writeglobals(out, prefix)
  local names = {}
  for label, num in pairs(map_global) do
    names[num] = label
  end
  out:write("enum {\n")
  for num = 10, next_global-1 do
    local ident = gsub(names[num], "@.*", "")
    out:write("  ", prefix, ident, ",\n")
  end
  out:write("  ", prefix, "_MAX\n};\n")
end
236
-- Emit a NULL-terminated C array of global label names called `name`,
-- ordered by label number.
local function writeglobalnames(out, name)
  local labels = {}
  for label, num in pairs(map_global) do
    labels[num] = label
  end
  out:write("static const char *const ", name, "[] = {\n")
  for num = 10, next_global-1 do
    out:write("  \"", labels[num], "\",\n")
  end
  out:write("  (const char *)0\n};\n")
end
247
248------------------------------------------------------------------------------
249
-- Extern label name -> extern label number. Unknown names get the next free
-- number on first lookup. Numbers are negative and count down from -1.
local next_extern = -1
local map_extern = setmetatable({}, {
  __index = function(tab, label)
    -- No restrictions on the name for now.
    local num = next_extern
    if num < -256 then
      werror("too many extern labels")
    end
    next_extern = num - 1
    tab[label] = num
    return num
  end,
})
260
-- List all extern label names, in order of first use, on `out`.
local function dumpexterns(out, lvl)
  local names = {}
  for label, num in pairs(map_extern) do
    names[-num] = label
  end
  out:write("Extern labels:\n")
  for idx = 1, -next_extern-1 do
    local line = format("  %s\n", names[idx])
    out:write(line)
  end
  out:write("\n")
end
271
-- Emit a NULL-terminated C array of extern label names called `name`,
-- in order of first use.
local function writeexternnames(out, name)
  local labels = {}
  for label, num in pairs(map_extern) do
    labels[-num] = label
  end
  out:write("static const char *const ", name, "[] = {\n")
  for idx = 1, -next_extern-1 do
    out:write("  \"", labels[idx], "\",\n")
  end
  out:write("  (const char *)0\n};\n")
end
282
283------------------------------------------------------------------------------
284
-- Arch-specific maps.
-- "Int. name" is the internal register name built by mkrmap(): "@" plus the
-- operand size code, optionally followed by the register number in hex.
local map_archdef = {}		-- Ext. register name -> int. name.
local map_reg_rev = {}		-- Int. register name -> ext. name.
local map_reg_num = {}		-- Int. register name -> register number.
local map_reg_opsize = {}	-- Int. register name -> operand size.
local map_reg_valid_base = {}	-- Int. register name -> valid base register?
local map_reg_valid_index = {}	-- Int. register name -> valid index register?
local map_reg_needrex = {}	-- Int. register name -> need rex vs. no rex.
local reg_list = {}		-- Canonical list of int. register names.

local map_type = {}		-- Type name -> { ctype, reg }
local ctypenum = 0		-- Type number (for _PTx macros).

local addrsize = x64 and "q" or "d"	-- Size for address operands.
299
-- Helper functions to fill register maps.
-- Registers one register class in all register maps.
--   sz     operand size code of the class (e.g. "d", "b", "f", "o").
--   cl     external name of the variable register class (e.g. "Rd", "xmm").
--   names  optional list of traditional register names, in number order.
-- The order of the insertions matters, since reg_list drives dumpregs().
local function mkrmap(sz, cl, names)
  -- The class itself: a variable register with the pseudo-number -1.
  local cname = format("@%s", sz)
  reg_list[#reg_list+1] = cname
  map_archdef[cl] = cname
  map_reg_rev[cname] = cl
  map_reg_num[cname] = -1
  map_reg_opsize[cname] = sz
  -- Only address-sized and dword registers may be used for addressing.
  if sz == addrsize or sz == "d" then
    map_reg_valid_base[cname] = true
    map_reg_valid_index[cname] = true
  end
  -- Traditional names: the n-th name gets register number n-1.
  if names then
    for n,name in ipairs(names) do
      local iname = format("@%s%x", sz, n-1)
      reg_list[#reg_list+1] = iname
      map_archdef[name] = iname
      map_reg_rev[iname] = name
      map_reg_num[iname] = n-1
      map_reg_opsize[iname] = sz
      -- The 5th to 8th named byte registers are flagged needrex = false.
      if sz == "b" and n > 4 then map_reg_needrex[iname] = false end
      if sz == addrsize or sz == "d" then
	map_reg_valid_base[iname] = true
	map_reg_valid_index[iname] = true
      end
    end
  end
  -- Numbered names: 0..15 on x64 (except for the "f" class), otherwise 0..7.
  for i=0,(x64 and sz ~= "f") and 15 or 7 do
    -- Numbered byte registers above 3 get a distinct internal name ("R"
    -- suffix) and are flagged needrex = true.
    local needrex = sz == "b" and i > 3
    local iname = format("@%s%x%s", sz, i, needrex and "R" or "")
    if needrex then map_reg_needrex[iname] = true end
    -- External name: <cl><i> for "o"/"y", st<i> for "f", else r<i><sz>
    -- (without the size suffix for address-sized registers).
    local name
    if sz == "o" or sz == "y" then name = format("%s%d", cl, i)
    elseif sz == "f" then name = format("st%d", i)
    else name = format("r%d%s", i, sz == addrsize and "" or sz) end
    map_archdef[name] = iname
    -- Don't override the reverse mapping of an already registered name.
    if not map_reg_rev[iname] then
      reg_list[#reg_list+1] = iname
      map_reg_rev[iname] = name
      map_reg_num[iname] = i
      map_reg_opsize[iname] = sz
      if sz == addrsize or sz == "d" then
	map_reg_valid_base[iname] = true
	map_reg_valid_index[iname] = true
      end
    end
  end
  -- An empty entry separates the classes (dumpregs() prints a blank line).
  reg_list[#reg_list+1] = ""
end
349
-- Integer registers (qword, dword, word and byte sized).
-- Qword registers are only registered for x64.
if x64 then
  mkrmap("q", "Rq", {"rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi"})
end
mkrmap("d", "Rd", {"eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi"})
mkrmap("w", "Rw", {"ax", "cx", "dx", "bx", "sp", "bp", "si", "di"})
mkrmap("b", "Rb", {"al", "cl", "dl", "bl", "ah", "ch", "dh", "bh"})
-- The stack pointer is rejected as an index register.
map_reg_valid_index[map_archdef.esp] = false
if x64 then map_reg_valid_index[map_archdef.rsp] = false end
-- On x64 the variable byte register class is flagged needrex = true.
if x64 then map_reg_needrex[map_archdef.Rb] = true end
-- "Ra" is the variable register class with the size of an address.
map_archdef["Ra"] = "@"..addrsize

-- FP registers (internally tword sized, but use "f" as operand size).
mkrmap("f", "Rf")

-- SSE registers (oword sized, but qword and dword accessible).
mkrmap("o", "xmm")

-- AVX registers (yword sized, but oword, qword and dword accessible).
mkrmap("y", "ymm")
370
-- Operand size prefixes to codes.
-- "aword" maps to the size of an address ("q" on x64, "d" on x86).
local map_opsize = {
  byte = "b", word = "w", dword = "d", qword = "q", oword = "o", yword = "y",
  tword = "t", aword = addrsize,
}

-- Operand size code to number.
-- The numbers are used as multipliers for "opsize*num" expressions.
local map_opsizenum = {
  b = 1, w = 2, d = 4, q = 8, o = 16, y = 32, t = 10,
}

-- Operand size code to name.
local map_opsizename = {
  b = "byte", w = "word", d = "dword", q = "qword", o = "oword", y = "yword",
  t = "tword", f = "fpword",
}

-- Valid index register scale factors.
-- Maps the scale factor (as a string) to its two-bit encoding.
local map_xsc = {
  ["1"] = 0, ["2"] = 1, ["4"] = 2, ["8"] = 3,
}

-- Condition codes.
-- The first two lines hold one name for each code 0..15, the last two lines
-- hold aliases which map to the same codes.
local map_cc = {
  o = 0, no = 1, b = 2, nb = 3, e = 4, ne = 5, be = 6, nbe = 7,
  s = 8, ns = 9, p = 10, np = 11, l = 12, nl = 13, le = 14, nle = 15,
  c = 2, nae = 2, nc = 3, ae = 3, z = 4, nz = 5, na = 6, a = 7,
  pe = 10, po = 11, nge = 12, ge = 13, ng = 14, g = 15,
}
400
401
-- Reverse defines for registers: replace every internal "@..." register
-- name in `s` which has an external name. Returns the resulting string and
-- the number of matches, just like gsub().
function _M.revdef(s)
  local text, nmatch = gsub(s, "@%w+", map_reg_rev)
  return text, nmatch
end
406
-- Print a table of register names, operand sizes and register numbers.
-- Empty entries in the register list show up as blank separator lines.
local function dumpregs(out)
  out:write("Register names, sizes and internal numbers:\n")
  for idx = 1, #reg_list do
    local reg = reg_list[idx]
    if reg ~= "" then
      local num = map_reg_num[reg]
      local shown = num < 0 and "(variable)" or num
      local sizename = map_opsizename[map_reg_opsize[reg]]
      out:write(format("  %-5s %-8s %s\n", map_reg_rev[reg], sizename, shown))
    else
      out:write("\n")
    end
  end
end
422
423------------------------------------------------------------------------------
424
-- Put action for label arg (IMM_LG, IMM_PC, REL_LG, REL_PC).
-- A numeric `imm` is a fixed label number (negative numbers are extern
-- labels), anything else is passed on as the argument of a PC label action.
local function wputlabel(aprefix, imm, num)
  if type(imm) ~= "number" then
    waction(aprefix.."PC", imm, num)
    return
  end
  if imm < 0 then
    -- Extern label: one byte for the kind, one for the extern index.
    local kind = 1
    if aprefix == "IMM_" then kind = 0 end
    waction("EXTERN")
    wputxb(kind)
    imm = -imm-1
  else
    waction(aprefix.."LG", nil, num)
  end
  wputxb(imm)
end
440
-- Put a signed byte: numbers are range-checked and emitted right away,
-- anything else becomes the argument of an IMM_S action.
local function wputsbarg(n)
  if type(n) ~= "number" then
    waction("IMM_S", n)
    return
  end
  if n < -128 or n > 127 then
    werror("signed immediate byte out of range")
  end
  -- Negative values are stored in two's complement form.
  if n < 0 then n = n + 256 end
  wputb(n)
end
451
-- Put an unsigned byte: numbers are range-checked and emitted right away,
-- anything else becomes the argument of an IMM_B action.
local function wputbarg(n)
  if type(n) ~= "number" then
    waction("IMM_B", n)
    return
  end
  if n < 0 or n > 255 then
    werror("unsigned immediate byte out of range")
  end
  wputb(n)
end
461
-- Put an unsigned word: numbers are range-checked and emitted low byte
-- first, anything else becomes the argument of an IMM_W action.
local function wputwarg(n)
  if type(n) ~= "number" then
    waction("IMM_W", n)
    return
  end
  if shr(n, 16) ~= 0 then
    werror("unsigned immediate word out of range")
  end
  wputb(band(n, 255))
  wputb(shr(n, 8))
end
471
-- Put a signed or unsigned dword. Numbers are emitted as four bytes, low
-- byte first. A table holds a label reference in its first slot. Anything
-- else becomes the argument of an IMM_D action.
local function wputdarg(n)
  local kind = type(n)
  if kind == "table" then
    wputlabel("IMM_", n[1], 1)
  elseif kind ~= "number" then
    waction("IMM_D", n)
  else
    wputb(band(n, 255))
    wputb(band(shr(n, 8), 255))
    wputb(band(shr(n, 16), 255))
    wputb(shr(n, 24))
  end
end
486
-- Put a signed or unsigned qword. Numbers are emitted as eight bytes, low
-- byte first. Anything else is split into two IMM_D actions (low dword,
-- then high dword).
local function wputqarg(n)
  if type(n) ~= "number" then
    waction("IMM_D", format("(unsigned int)(%s)", n))
    waction("IMM_D", format("(unsigned int)((unsigned long long)(%s)>>32)", n))
    return
  end
  -- This is only used for numbers from -2^31..2^32-1.
  wputb(band(n, 255))
  wputb(band(shr(n, 8), 255))
  wputb(band(shr(n, 16), 255))
  wputb(shr(n, 24))
  -- The upper dword is the sign extension of the lower one.
  local fill = 0
  if n < 0 then fill = 255 end
  for _ = 1, 4 do wputb(fill) end
end
502
-- Put a number or arg with a size given by the size code `sz`.
-- A missing size, "d" and "q" all store a dword. "s" stores a signed byte.
local function wputszarg(sz, n)
  if sz == "w" then
    wputwarg(n)
  elseif sz == "b" then
    wputbarg(n)
  elseif sz == "s" then
    wputsbarg(n)
  elseif not sz or sz == "d" or sz == "q" then
    wputdarg(n)
  else
    werror("bad operand size")
  end
end
511
-- Put multi-byte opcode with operand-size dependent modifications.
--   sz      operand size code: "w" emits a prefix byte 102 (0x66), "b"
--           decrements the last opcode byte, "y" sets the L bit of a VEX
--           prefix.
--   op      opcode number; its bytes are emitted most significant first.
--   rex     REX bits; a non-zero value emits the byte 64 + low 4 bits
--           (or is folded into the VEX prefix).
--   vex     optional table with the fields m, p, l and v.
--   vregr, vregxb  truthy if variable registers are involved.
--           NOTE(review): presumably for the reg field resp. the
--           index/base/rm fields -- confirm against the callers.
-- Returns psz and sk.
-- NOTE(review): presumably these are the values later handed on to
-- wputmrmsib() and wvreg() -- confirm against the callers.
local function wputop(sz, op, rex, vex, vregr, vregxb)
  local psz, sk = 0, nil
  if vex then
    local tail
    -- Short form (0xc5 plus one byte) is possible if m == 1 and the rex
    -- bits 0, 1 and 3 are clear.
    if vex.m == 1 and band(rex, 11) == 0 then
      if x64 and vregxb then
	-- Keep the long form, but flag it via sk.
	sk = map_vreg["modrm.reg"]
      else
	wputb(0xc5)
      tail = shl(bxor(band(rex, 4), 4), 5)
      psz = 3
      end
    end
    -- Long form: 0xc4 plus two bytes. The low 3 rex bits are stored inverted.
    if not tail then
      wputb(0xc4)
      wputb(shl(bxor(band(rex, 7), 7), 5) + vex.m)
      tail = shl(band(rex, 8), 4)
      psz = 4
    end
    local reg, vreg = 0, nil
    if vex.v then
      reg = vex.v.reg
      if not reg then werror("bad vex operand") end
      -- A negative register number denotes a variable register.
      if reg < 0 then reg = 0; vreg = vex.v.vreg end
    end
    if sz == "y" or vex.l then tail = tail + 4 end
    -- The vvvv register number is stored inverted.
    wputb(tail + shl(bxor(reg, 15), 3) + vex.p)
    wvreg("vex.v", vreg)
    rex = 0 -- Already part of the VEX prefix.
    if op >= 256 then werror("bad vex opcode") end
  else
    if rex ~= 0 then
      if not x64 then werror("bad operand size") end
    elseif (vregr or vregxb) and x64 then
      -- Bit 4 is masked off below, so this forces a bare prefix byte 64.
      rex = 0x10
      sk = map_vreg["vex.v"]
    end
  end
  local r
  if sz == "w" then wputb(102) end
  -- Needs >32 bit numbers, but only for crc32 eax, word [ebx]
  if op >= 4294967296 then r = op%4294967296 wputb((op-r)/4294967296) op = r end
  if op >= 16777216 then wputb(shr(op, 24)); op = band(op, 0xffffff) end
  if op >= 65536 then
    if rex ~= 0 then
      -- For 0f 3a xx and 0f 38 xx opcodes the REX byte goes in front.
      local opc3 = band(op, 0xffff00)
      if opc3 == 0x0f3a00 or opc3 == 0x0f3800 then
	wputb(64 + band(rex, 15)); rex = 0; psz = 2
      end
    end
    wputb(shr(op, 16)); op = band(op, 0xffff); psz = psz + 1
  end
  if op >= 256 then
    local b = shr(op, 8)
    -- For 0f xx opcodes the REX byte goes in front of the 0f byte.
    if b == 15 and rex ~= 0 then wputb(64 + band(rex, 15)); rex = 0; psz = 2 end
    wputb(b); op = band(op, 255); psz = psz + 1
  end
  -- Otherwise the REX byte goes right in front of the last opcode byte.
  if rex ~= 0 then wputb(64 + band(rex, 15)); psz = 2 end
  if sz == "b" then op = op - 1 end
  wputb(op)
  return psz, sk
end
575
-- Put ModRM or SIB formatted byte: two bits from `m`, followed by the low
-- three bits of `s` and the low three bits of `rm`.
-- The `vs` and `vrm` parameters are not used.
local function wputmodrm(m, s, rm, vs, vrm)
  assert(m < 4 and s < 16 and rm < 16, "bad modrm operands")
  local hi = shl(m, 6)
  local mid = shl(band(s, 7), 3)
  local lo = band(rm, 7)
  wputb(hi + mid + lo)
end
581
-- Put ModRM/SIB plus optional displacement.
--   t      operand table; the fields mode, reg, vreg, xreg, vxreg, xsc and
--          disp are read here.
--   imark  immediate marker; "I" requests a MARK action, and any of
--          "U", "W", "S", "i", "I" is rejected for rip-relative operands.
--   s      value for the spare (reg) field; negative values are treated
--          as 0.
--   vsreg  variable register expression for the spare field, if any.
--   psz, sk  passed through to wvreg().
--          NOTE(review): presumably the results of wputop() -- confirm.
local function wputmrmsib(t, imark, s, vsreg, psz, sk)
  local vreg, vxreg
  local reg, xreg = t.reg, t.xreg
  -- Negative register numbers denote variable registers: encode a zero
  -- and let a VREG action fill in the real number.
  if reg and reg < 0 then reg = 0; vreg = t.vreg end
  if xreg and xreg < 0 then xreg = 0; vxreg = t.vxreg end
  if s < 0 then s = 0 end

  -- Register mode.
  if sub(t.mode, 1, 1) == "r" then
    wputmodrm(3, s, reg)
    wvreg("modrm.reg", vsreg, psz+1, sk, vreg)
    wvreg("modrm.rm.r", vreg, psz+1, sk)
    return
  end

  local disp = t.disp
  local tdisp = type(disp)
  -- No base register?
  if not reg then
    local riprel = false
    if xreg then
      -- Indexed mode with index register only.
      -- [xreg*xsc+disp] -> (0, s, esp) (xsc, xreg, ebp)
      wputmodrm(0, s, 4)
      if imark == "I" then waction("MARK") end
      wvreg("modrm.reg", vsreg, psz+1, sk, vxreg)
      wputmodrm(t.xsc, xreg, 5)
      wvreg("sib.index", vxreg, psz+2, sk)
    else
      -- Pure 32 bit displacement.
      if x64 and tdisp ~= "table" then
	wputmodrm(0, s, 4) -- [disp] -> (0, s, esp) (0, esp, ebp)
	wvreg("modrm.reg", vsreg, psz+1, sk)
	if imark == "I" then waction("MARK") end
	wputmodrm(0, 4, 5)
      else
	-- On x64 a table (label reference) is encoded rip-relative.
	riprel = x64
	wputmodrm(0, s, 5) -- [disp|rip-label] -> (0, s, ebp)
	wvreg("modrm.reg", vsreg, psz+1, sk)
	if imark == "I" then waction("MARK") end
      end
    end
    if riprel then -- Emit rip-relative displacement.
      if match("UWSiI", imark) then
	werror("NYI: rip-relative displacement followed by immediate")
      end
      -- The previous byte in the action buffer cannot be 0xe9 or 0x80-0x8f.
      if disp[2] == "iPJ" then
	waction("REL_A", disp[1])
      else
	wputlabel("REL_", disp[1], 2)
      end
    else
      wputdarg(disp)
    end
    return
  end

  -- Select the mod field: 0 = no displacement, 1 = signed byte, 2 = dword.
  -- It stays nil if the size must be decided by a DISP action.
  local m
  if tdisp == "number" then -- Check displacement size at assembly time.
    if disp == 0 and band(reg, 7) ~= 5 then -- [ebp] -> [ebp+0] (in SIB, too)
      if not vreg then m = 0 end -- Force DISP to allow [Rd(5)] -> [ebp+0]
    elseif disp >= -128 and disp <= 127 then m = 1
    else m = 2 end
  elseif tdisp == "table" then
    m = 2
  end

  -- Index register present or esp as base register: need SIB encoding.
  if xreg or band(reg, 7) == 4 then
    wputmodrm(m or 2, s, 4) -- ModRM.
    if m == nil or imark == "I" then waction("MARK") end
    wvreg("modrm.reg", vsreg, psz+1, sk, vxreg or vreg)
    wputmodrm(t.xsc or 0, xreg or 4, reg) -- SIB.
    wvreg("sib.index", vxreg, psz+2, sk, vreg)
    wvreg("sib.base", vreg, psz+2, sk)
  else
    wputmodrm(m or 2, s, reg) -- ModRM.
    if (imark == "I" and (m == 1 or m == 2)) or
       (m == nil and (vsreg or vreg)) then waction("MARK") end
    wvreg("modrm.reg", vsreg, psz+1, sk, vreg)
    wvreg("modrm.rm.m", vreg, psz+1, sk)
  end

  -- Put displacement.
  if m == 1 then wputsbarg(disp)
  elseif m == 2 then wputdarg(disp)
  elseif m == nil then waction("DISP", disp) end
end
672
673------------------------------------------------------------------------------
674
-- Build a human-readable description of an instruction: the opcode name,
-- followed by the first mode character plus the operand size ("?" if
-- unknown) of every operand.
local function opmodestr(op, args)
  local modes = {}
  local count = #args
  for idx = 1, count do
    local operand = args[idx]
    local size = operand.opsize or "?"
    modes[idx] = sub(operand.mode, 1, 1)..size
  end
  return op.." "..concat(modes, ",")
end
684
-- Convert an expression to a valid integer.
-- Returns nothing if `expr` is not a number at all. Non-integral and
-- out-of-range numbers are reported as errors. With `isqword` set, numbers
-- outside the signed 32 bit range yield nil instead, so the caller handles
-- them as an expression.
local function toint(expr, isqword)
  local n = tonumber(expr)
  if not n then return end
  if n % 1 ~= 0 then
    werror("not an integer number `"..expr.."'")
  elseif isqword then
    if n < -2147483648 or n > 2147483647 then
      n = nil -- Handle it as an expression to avoid precision loss.
    end
  else
    if n < -2147483648 or n > 4294967295 then
      werror("bad integer number `"..expr.."'")
    end
  end
  return n
end
701
-- Parse immediate expression. Returns an operand mode string and a value:
--   &expr         -> "iPJ", expression cast to ptrdiff_t
--   =>expr        -> "iJ",  pc label expression
--   ->name        -> "iJ",  global label number
--   <1-9 or >1-9  -> "iJ",  local label number (Fwd: 247-255, Bkwd: 1-9)
--   extern name   -> "iJ",  extern label number
--   anything else -> "iI",  the unchanged expression
local function immexpr(expr)
  local first, first2 = sub(expr, 1, 1), sub(expr, 1, 2)
  if first == "&" then
    return "iPJ", format("(ptrdiff_t)(%s)", sub(expr,2))
  elseif first2 == "=>" then
    return "iJ", sub(expr, 3)
  elseif first2 == "->" then
    return "iJ", map_global[sub(expr, 3)]
  end

  local dir, lnum = match(expr, "^([<>])([1-9])$")
  if dir then
    local base = 0
    if dir == ">" then base = 246 end
    return "iJ", lnum + base
  end

  local extname = match(expr, "^extern%s+(%S+)$")
  if extname then
    return "iJ", map_extern[extname]
  end

  return "iI", expr
end
733
-- Parse displacement expression: +-num, +-expr, +-opsize*num
-- Returns a number where possible, a { value, mode } table for label
-- references and the expression string for everything else.
local function dispexpr(expr)
  local num = expr == "" and 0 or toint(expr)
  if num then return num end
  local sign, rest = match(expr, "^([+-])%s*(.+)$")
  if not sign then
    werror("bad displacement expression `"..expr.."'")
  elseif sign == "+" then
    expr = rest
  end
  -- opsize*num is evaluated right here.
  local szname, count = match(rest, "^(%w+)%s*%*%s*(.+)$")
  local szcode, factor = map_opsize[szname], toint(count)
  if szcode and factor then
    if sign == "-" then factor = -factor end
    return factor*map_opsizenum[szcode]
  end
  local mode, target = immexpr(rest)
  if mode == "iJ" or mode == "iPJ" then
    if sign == "-" then werror("cannot invert label reference") end
    return { target, mode }
  end
  return expr -- Need to return original signed expression.
end
757
-- Parse register or type expression.
-- For a known type name (optionally written as type:@reg to override the
-- register) this returns the register name, its number and the type table.
-- Otherwise the expression and its register number (if any) are returned.
local function rtexpr(expr)
  if not expr then return end
  local tname, override = match(expr, "^([%w_]+):(@[%w_]+)$")
  local key = tname or expr
  local tp = map_type[key]
  if not tp then
    return expr, map_reg_num[expr]
  end
  local reg = override or tp.reg
  local rnum = map_reg_num[reg]
  if not rnum then
    werror("type `"..key.."' needs a register override")
  end
  if not map_reg_valid_base[reg] then
    werror("bad base register override `"..(map_reg_rev[reg] or reg).."'")
  end
  return reg, rnum, tp
end
776
-- Parse operand and return { mode, opsize, reg, xreg, xsc, disp, imm }.
--   param    operand text, optionally preceded by an operand size override.
--   isqword  passed on to toint() for plain immediates.
-- Further fields set where applicable: vreg and vxreg (variable register
-- expressions for reg and xreg) and needrex.
local function parseoperand(param, isqword)
  local t = {}

  -- Split off a leading operand size override (e.g. "dword"), if present.
  local expr = param
  local opsize, tailops = match(param, "^(%w+)%s*(.+)$")
  if opsize then
    t.opsize = map_opsize[opsize]
    if t.opsize then expr = tailops end
  end

  local br = match(expr, "^%[%s*(.-)%s*%]$")
  -- The repeat ... until true runs once; it only exists to allow `break`.
  repeat
    if br then
      t.mode = "xm"

      -- [disp]
      t.disp = toint(br)
      if t.disp then
	t.mode = x64 and "xm" or "xmO"
	break
      end

      -- [reg...]
      local tp
      local reg, tailr = match(br, "^([@%w_:]+)%s*(.*)$")
      reg, t.reg, tp = rtexpr(reg)
      if not t.reg then
	-- [expr]
	t.mode = x64 and "xm" or "xmO"
	t.disp = dispexpr("+"..br)
	break
      end

      -- Variable register: needs a parenthesized expression.
      if t.reg == -1 then
	t.vreg, tailr = match(tailr, "^(%b())(.*)$")
	if not t.vreg then werror("bad variable register expression") end
      end

      -- [xreg*xsc] or [xreg*xsc+-disp] or [xreg*xsc+-expr]
      local xsc, tailsc = match(tailr, "^%*%s*([1248])%s*(.*)$")
      if xsc then
	if not map_reg_valid_index[reg] then
	  werror("bad index register `"..map_reg_rev[reg].."'")
	end
	-- The register parsed first is the index, there is no base.
	t.xsc = map_xsc[xsc]
	t.xreg = t.reg
	t.vxreg = t.vreg
	t.reg = nil
	t.vreg = nil
	t.disp = dispexpr(tailsc)
	break
      end
      if not map_reg_valid_base[reg] then
	werror("bad base register `"..map_reg_rev[reg].."'")
      end

      -- [reg] or [reg+-disp]
      t.disp = toint(tailr) or (tailr == "" and 0)
      if t.disp then break end

      -- [reg+xreg...]
      local xreg, tailx = match(tailr, "^%+%s*([@%w_:]+)%s*(.*)$")
      xreg, t.xreg, tp = rtexpr(xreg)
      if not t.xreg then
	-- [reg+-expr]
	t.disp = dispexpr(tailr)
	break
      end
      if not map_reg_valid_index[xreg] then
	werror("bad index register `"..map_reg_rev[xreg].."'")
      end

      if t.xreg == -1 then
	t.vxreg, tailx = match(tailx, "^(%b())(.*)$")
	if not t.vxreg then werror("bad variable register expression") end
      end

      -- [reg+xreg*xsc...]
      local xsc, tailsc = match(tailx, "^%*%s*([1248])%s*(.*)$")
      if xsc then
	t.xsc = map_xsc[xsc]
	tailx = tailsc
      end

      -- [...] or [...+-disp] or [...+-expr]
      t.disp = dispexpr(tailx)
    else
      -- imm or opsize*imm
      local imm = toint(expr, isqword)
      if not imm and sub(expr, 1, 1) == "*" and t.opsize then
	imm = toint(sub(expr, 2))
	if imm then
	  imm = imm * map_opsizenum[t.opsize]
	  t.opsize = nil
	end
      end
      if imm then
	if t.opsize then werror("bad operand size override") end
	-- Mode suffixes: "1" for the value 1, "S" for a signed byte.
	local m = "i"
	if imm == 1 then m = m.."1" end
	-- Map 0xffffff80..0xffffffff to -128..-1.
	if imm >= 4294967168 and imm <= 4294967295 then imm = imm-4294967296 end
	if imm >= -128 and imm <= 127 then m = m.."S" end
	t.imm = imm
	t.mode = m
	break
      end

      local tp
      local reg, tailr = match(expr, "^([@%w_:]+)%s*(.*)$")
      reg, t.reg, tp = rtexpr(reg)
      if t.reg then
	if t.reg == -1 then
	  t.vreg, tailr = match(tailr, "^(%b())(.*)$")
	  if not t.vreg then werror("bad variable register expression") end
	end
	-- reg
	if tailr == "" then
	  if t.opsize then werror("bad operand size override") end
	  t.opsize = map_reg_opsize[reg]
	  if t.opsize == "f" then
	    t.mode = t.reg == 0 and "fF" or "f"
	  else
	    if reg == "@w4" or (x64 and reg == "@d4") then
	      wwarn("bad idea, try again with `"..(x64 and "rsp'" or "esp'"))
	    end
	    -- Register 0 gets the "R" suffix, the byte register 1 gets "C".
	    t.mode = t.reg == 0 and "rmR" or (reg == "@b1" and "rmC" or "rm")
	  end
	  t.needrex = map_reg_needrex[reg]
	  break
	end

	-- type[idx], type[idx].field, type->field -> [reg+offset_expr]
	if not tp then werror("bad operand `"..param.."'") end
	t.mode = "xm"
	t.disp = format(tp.ctypefmt, tailr)
      else
	t.mode, t.imm = immexpr(expr)
	-- Jump targets are always address-sized.
	if sub(t.mode, -1) == "J" then
	  if t.opsize and t.opsize ~= addrsize then
	    werror("bad operand size override")
	  end
	  t.opsize = addrsize
	end
      end
    end
  until true
  return t
end
926
927------------------------------------------------------------------------------
928-- x86 Template String Description
929-- ===============================
930--
931-- Each template string is a list of [match:]pattern pairs,
932-- separated by "|". The first match wins. No match means a
933-- bad or unsupported combination of operand modes or sizes.
934--
935-- The match part and the ":" is omitted if the operation has
936-- no operands. Otherwise the first N characters are matched
937-- against the mode strings of each of the N operands.
938--
939-- The mode string for each operand type is (see parseoperand()):
940--   Integer register: "rm", +"R" for eax, ax, al, +"C" for cl
941--   FP register:      "f",  +"F" for st0
942--   Index operand:    "xm", +"O" for [disp] (pure offset)
943--   Immediate:        "i",  +"S" for signed 8 bit, +"1" for 1,
944--                     +"I" for arg, +"P" for pointer
945--   Any:              +"J" for valid jump targets
946--
947-- So a match character "m" (mixed) matches both an integer register
948-- and an index operand (to be encoded with the ModRM/SIB scheme).
949-- But "r" matches only a register and "x" only an index operand
950-- (e.g. for FP memory access operations).
951--
952-- The operand size match string starts right after the mode match
953-- characters and ends before the ":". "dwb" or "qdwb" is assumed, if empty.
954-- The effective data size of the operation is matched against this list.
955--
956-- If only the regular "b", "w", "d", "q", "t" operand sizes are
957-- present, then all operands must be the same size. Unspecified sizes
958-- are ignored, but at least one operand must have a size or the pattern
959-- won't match (use the "byte", "word", "dword", "qword", "tword"
960-- operand size overrides. E.g.: mov dword [eax], 1).
961--
962-- If the list has a "1" or "2" prefix, the operand size is taken
963-- from the respective operand and any other operand sizes are ignored.
964-- If the list contains only ".", all operand sizes are ignored.
965-- If the list has a "/" prefix, the concatenated (mixed) operand sizes
966-- are compared to the match.
967--
968-- E.g. "rrdw" matches for either two dword registers or two word
969-- registers. "Fx2dq" matches an st0 operand plus an index operand
970-- pointing to a dword (float) or qword (double).
971--
972-- Every character after the ":" is part of the pattern string:
973--   Hex chars are accumulated to form the opcode (left to right).
974--   "n"       disables the standard opcode mods
975--             (otherwise: -1 for "b", o16 prefix for "w", rex.w for "q")
976--   "X"       Force REX.W.
977--   "r"/"R"   adds the reg. number from the 1st/2nd operand to the opcode.
978--   "m"/"M"   generates ModRM/SIB from the 1st/2nd operand.
979--             The spare 3 bits are either filled with the last hex digit or
980--             the result from a previous "r"/"R". The opcode is restored.
981--   "u"       Use VEX encoding, vvvv unused.
982--   "v"/"V"   Use VEX encoding, vvvv from 1st/2nd operand (the operand is
983--             removed from the list used by future characters).
984--   "w"       Use VEX encoding, vvvv from 3rd operand.
985--   "L"       Force VEX.L
986--
987-- All of the following characters force a flush of the opcode:
988--   "o"/"O"   stores a pure 32 bit disp (offset) from the 1st/2nd operand.
989--   "s"       stores a 4 bit immediate from the last register operand,
990--             followed by 4 zero bits.
991--   "S"       stores a signed 8 bit immediate from the last operand.
992--   "U"       stores an unsigned 8 bit immediate from the last operand.
993--   "W"       stores an unsigned 16 bit immediate from the last operand.
994--   "i"       stores an operand sized immediate from the last operand.
--   "I"       ditto, but generates an action code to optionally modify
996--             the opcode (+2) for a signed 8 bit immediate.
997--   "J"       generates one of the REL action codes from the last operand.
998--
999------------------------------------------------------------------------------
1000
-- Template strings for x86 instructions. Ordered by first opcode byte.
-- Unimplemented opcodes (deliberate omissions) are marked with *.
--
-- Keys are "<mnemonic>_<number of operands>"; values are template strings
-- in the format described in the "x86 Template String Description" above.
-- Entries written as `x64 and "..."` or `not x64 and "..."` evaluate to a
-- false value (nil/false) instead of a template string when the guard fails.
-- Further entries are generated programmatically right after this table.
local map_op = {
  -- 00-05: add...
  -- 06: *push es
  -- 07: *pop es
  -- 08-0D: or...
  -- 0E: *push cs
  -- 0F: two byte opcode prefix
  -- 10-15: adc...
  -- 16: *push ss
  -- 17: *pop ss
  -- 18-1D: sbb...
  -- 1E: *push ds
  -- 1F: *pop ds
  -- 20-25: and...
  es_0 =	"26",
  -- 27: *daa
  -- 28-2D: sub...
  cs_0 =	"2E",
  -- 2F: *das
  -- 30-35: xor...
  ss_0 =	"36",
  -- 37: *aaa
  -- 38-3D: cmp...
  ds_0 =	"3E",
  -- 3F: *aas
  inc_1 =	x64 and "m:FF0m" or "rdw:40r|m:FF0m",
  dec_1 =	x64 and "m:FF1m" or "rdw:48r|m:FF1m",
  push_1 =	(x64 and "rq:n50r|rw:50r|mq:nFF6m|mw:FF6m" or
			 "rdw:50r|mdw:FF6m").."|S.:6AS|ib:n6Ai|i.:68i",
  pop_1 =	x64 and "rq:n58r|rw:58r|mq:n8F0m|mw:8F0m" or "rdw:58r|mdw:8F0m",
  -- 60: *pusha, *pushad, *pushaw
  -- 61: *popa, *popad, *popaw
  -- 62: *bound rdw,x
  -- 63: x86: *arpl mw,rw
  movsxd_2 =	x64 and "rm/qd:63rM",
  fs_0 =	"64",
  gs_0 =	"65",
  o16_0 =	"66",
  a16_0 =	not x64 and "67" or nil,
  a32_0 =	x64 and "67",
  -- 68: push idw
  -- 69: imul rdw,mdw,idw
  -- 6A: push ib
  -- 6B: imul rdw,mdw,S
  -- 6C: *insb
  -- 6D: *insd, *insw
  -- 6E: *outsb
  -- 6F: *outsd, *outsw
  -- 70-7F: jcc lb
  -- 80: add... mb,i
  -- 81: add... mdw,i
  -- 82: *undefined
  -- 83: add... mdw,S
  test_2 =	"mr:85Rm|rm:85rM|Ri:A9ri|mi:F70mi",
  -- 86: xchg rb,mb
  -- 87: xchg rdw,mdw
  -- 88: mov mb,r
  -- 89: mov mdw,r
  -- 8A: mov r,mb
  -- 8B: mov r,mdw
  -- 8C: *mov mdw,seg
  lea_2 =	"rx1dq:8DrM",
  -- 8E: *mov seg,mdw
  -- 8F: pop mdw
  nop_0 =	"90",
  xchg_2 =	"Rrqdw:90R|rRqdw:90r|rm:87rM|mr:87Rm",
  cbw_0 =	"6698",
  cwde_0 =	"98",
  cdqe_0 =	"4898",
  cwd_0 =	"6699",
  cdq_0 =	"99",
  cqo_0 =	"4899",
  -- 9A: *call iw:idw
  wait_0 =	"9B",
  fwait_0 =	"9B",
  pushf_0 =	"9C",
  pushfd_0 =	not x64 and "9C",
  pushfq_0 =	x64 and "9C",
  popf_0 =	"9D",
  popfd_0 =	not x64 and "9D",
  popfq_0 =	x64 and "9D",
  sahf_0 =	"9E",
  lahf_0 =	"9F",
  mov_2 =	"OR:A3o|RO:A1O|mr:89Rm|rm:8BrM|rib:nB0ri|ridw:B8ri|mi:C70mi",
  movsb_0 =	"A4",
  movsw_0 =	"66A5",
  movsd_0 =	"A5",
  cmpsb_0 =	"A6",
  cmpsw_0 =	"66A7",
  cmpsd_0 =	"A7",
  -- A8: test Rb,i
  -- A9: test Rdw,i
  stosb_0 =	"AA",
  stosw_0 =	"66AB",
  stosd_0 =	"AB",
  lodsb_0 =	"AC",
  lodsw_0 =	"66AD",
  lodsd_0 =	"AD",
  scasb_0 =	"AE",
  scasw_0 =	"66AF",
  scasd_0 =	"AF",
  -- B0-B7: mov rb,i
  -- B8-BF: mov rdw,i
  -- C0: rol... mb,i
  -- C1: rol... mdw,i
  ret_1 =	"i.:nC2W",
  ret_0 =	"C3",
  -- C4: *les rdw,mq
  -- C5: *lds rdw,mq
  -- C6: mov mb,i
  -- C7: mov mdw,i
  -- C8: *enter iw,ib
  leave_0 =	"C9",
  -- CA: *retf iw
  -- CB: *retf
  int3_0 =	"CC",
  int_1 =	"i.:nCDU",
  into_0 =	"CE",
  -- CF: *iret
  -- D0: rol... mb,1
  -- D1: rol... mdw,1
  -- D2: rol... mb,cl
  -- D3: rol... mdw,cl
  -- D4: *aam ib
  -- D5: *aad ib
  -- D6: *salc
  -- D7: *xlat
  -- D8-DF: floating point ops
  -- E0: *loopne
  -- E1: *loope
  -- E2: *loop
  -- E3: *jcxz, *jecxz
  -- E4: *in Rb,ib
  -- E5: *in Rdw,ib
  -- E6: *out ib,Rb
  -- E7: *out ib,Rdw
  call_1 =	x64 and "mq:nFF2m|J.:E8nJ" or "md:FF2m|J.:E8J",
  jmp_1 =	x64 and "mq:nFF4m|J.:E9nJ" or "md:FF4m|J.:E9J", -- short: EB
  -- EA: *jmp iw:idw
  -- EB: jmp ib
  -- EC: *in Rb,dx
  -- ED: *in Rdw,dx
  -- EE: *out dx,Rb
  -- EF: *out dx,Rdw
  lock_0 =	"F0",
  int1_0 =	"F1",
  repne_0 =	"F2",
  repnz_0 =	"F2",
  rep_0 =	"F3",
  repe_0 =	"F3",
  repz_0 =	"F3",
  endbr32_0 =	"F30F1EFB",
  endbr64_0 =	"F30F1EFA",
  -- F4: *hlt
  cmc_0 =	"F5",
  -- F6: test... mb,i; div... mb
  -- F7: test... mdw,i; div... mdw
  clc_0 =	"F8",
  stc_0 =	"F9",
  -- FA: *cli
  cld_0 =	"FC",
  std_0 =	"FD",
  -- FE: inc... mb
  -- FF: inc... mdw

  -- misc ops
  not_1 =	"m:F72m",
  neg_1 =	"m:F73m",
  mul_1 =	"m:F74m",
  imul_1 =	"m:F75m",
  div_1 =	"m:F76m",
  idiv_1 =	"m:F77m",

  imul_2 =	"rmqdw:0FAFrM|rIqdw:69rmI|rSqdw:6BrmS|riqdw:69rmi",
  imul_3 =	"rmIqdw:69rMI|rmSqdw:6BrMS|rmiqdw:69rMi",

  movzx_2 =	"rm/db:0FB6rM|rm/qb:|rm/wb:0FB6rM|rm/dw:0FB7rM|rm/qw:",
  movsx_2 =	"rm/db:0FBErM|rm/qb:|rm/wb:0FBErM|rm/dw:0FBFrM|rm/qw:",

  bswap_1 =	"rqd:0FC8r",
  bsf_2 =	"rmqdw:0FBCrM",
  bsr_2 =	"rmqdw:0FBDrM",
  bt_2 =	"mrqdw:0FA3Rm|miqdw:0FBA4mU",
  btc_2 =	"mrqdw:0FBBRm|miqdw:0FBA7mU",
  btr_2 =	"mrqdw:0FB3Rm|miqdw:0FBA6mU",
  bts_2 =	"mrqdw:0FABRm|miqdw:0FBA5mU",

  shld_3 =	"mriqdw:0FA4RmU|mrC/qq:0FA5Rm|mrC/dd:|mrC/ww:",
  shrd_3 =	"mriqdw:0FACRmU|mrC/qq:0FADRm|mrC/dd:|mrC/ww:",

  rdtsc_0 =	"0F31", -- P1+
  rdpmc_0 =	"0F33", -- P6+
  cpuid_0 =	"0FA2", -- P1+

  -- floating point ops
  fst_1 =	"ff:DDD0r|xd:D92m|xq:nDD2m",
  fstp_1 =	"ff:DDD8r|xd:D93m|xq:nDD3m|xt:DB7m",
  fld_1 =	"ff:D9C0r|xd:D90m|xq:nDD0m|xt:DB5m",

  fpop_0 =	"DDD8", -- Alias for fstp st0.

  fist_1 =	"xw:nDF2m|xd:DB2m",
  fistp_1 =	"xw:nDF3m|xd:DB3m|xq:nDF7m",
  fild_1 =	"xw:nDF0m|xd:DB0m|xq:nDF5m",

  fxch_0 =	"D9C9",
  fxch_1 =	"ff:D9C8r",
  fxch_2 =	"fFf:D9C8r|Fff:D9C8R",

  fucom_1 =	"ff:DDE0r",
  fucom_2 =	"Fff:DDE0R",
  fucomp_1 =	"ff:DDE8r",
  fucomp_2 =	"Fff:DDE8R",
  fucomi_1 =	"ff:DBE8r", -- P6+
  fucomi_2 =	"Fff:DBE8R", -- P6+
  fucomip_1 =	"ff:DFE8r", -- P6+
  fucomip_2 =	"Fff:DFE8R", -- P6+
  fcomi_1 =	"ff:DBF0r", -- P6+
  fcomi_2 =	"Fff:DBF0R", -- P6+
  fcomip_1 =	"ff:DFF0r", -- P6+
  fcomip_2 =	"Fff:DFF0R", -- P6+
  fucompp_0 =	"DAE9",
  fcompp_0 =	"DED9",

  fldenv_1 =	"x.:D94m",
  fnstenv_1 =	"x.:D96m",
  fstenv_1 =	"x.:9BD96m",
  fldcw_1 =	"xw:nD95m",
  fstcw_1 =	"xw:n9BD97m",
  fnstcw_1 =	"xw:nD97m",
  fstsw_1 =	"Rw:n9BDFE0|xw:n9BDD7m",
  fnstsw_1 =	"Rw:nDFE0|xw:nDD7m",
  fclex_0 =	"9BDBE2",
  fnclex_0 =	"DBE2",

  fnop_0 =	"D9D0",
  -- D9D1-D9DF: unassigned

  fchs_0 =	"D9E0",
  fabs_0 =	"D9E1",
  -- D9E2: unassigned
  -- D9E3: unassigned
  ftst_0 =	"D9E4",
  fxam_0 =	"D9E5",
  -- D9E6: unassigned
  -- D9E7: unassigned
  fld1_0 =	"D9E8",
  fldl2t_0 =	"D9E9",
  fldl2e_0 =	"D9EA",
  fldpi_0 =	"D9EB",
  fldlg2_0 =	"D9EC",
  fldln2_0 =	"D9ED",
  fldz_0 =	"D9EE",
  -- D9EF: unassigned

  f2xm1_0 =	"D9F0",
  fyl2x_0 =	"D9F1",
  fptan_0 =	"D9F2",
  fpatan_0 =	"D9F3",
  fxtract_0 =	"D9F4",
  fprem1_0 =	"D9F5",
  fdecstp_0 =	"D9F6",
  fincstp_0 =	"D9F7",
  fprem_0 =	"D9F8",
  fyl2xp1_0 =	"D9F9",
  fsqrt_0 =	"D9FA",
  fsincos_0 =	"D9FB",
  frndint_0 =	"D9FC",
  fscale_0 =	"D9FD",
  fsin_0 =	"D9FE",
  fcos_0 =	"D9FF",

  -- SSE, SSE2
  andnpd_2 =	"rmo:660F55rM",
  andnps_2 =	"rmo:0F55rM",
  andpd_2 =	"rmo:660F54rM",
  andps_2 =	"rmo:0F54rM",
  clflush_1 =	"x.:0FAE7m",
  cmppd_3 =	"rmio:660FC2rMU",
  cmpps_3 =	"rmio:0FC2rMU",
  cmpsd_3 =	"rrio:F20FC2rMU|rxi/oq:",
  cmpss_3 =	"rrio:F30FC2rMU|rxi/od:",
  comisd_2 =	"rro:660F2FrM|rx/oq:",
  comiss_2 =	"rro:0F2FrM|rx/od:",
  cvtdq2pd_2 =	"rro:F30FE6rM|rx/oq:",
  cvtdq2ps_2 =	"rmo:0F5BrM",
  cvtpd2dq_2 =	"rmo:F20FE6rM",
  cvtpd2ps_2 =	"rmo:660F5ArM",
  cvtpi2pd_2 =	"rx/oq:660F2ArM",
  cvtpi2ps_2 =	"rx/oq:0F2ArM",
  cvtps2dq_2 =	"rmo:660F5BrM",
  cvtps2pd_2 =	"rro:0F5ArM|rx/oq:",
  cvtsd2si_2 =	"rr/do:F20F2DrM|rr/qo:|rx/dq:|rxq:",
  cvtsd2ss_2 =	"rro:F20F5ArM|rx/oq:",
  cvtsi2sd_2 =	"rm/od:F20F2ArM|rm/oq:F20F2ArXM",
  cvtsi2ss_2 =	"rm/od:F30F2ArM|rm/oq:F30F2ArXM",
  cvtss2sd_2 =	"rro:F30F5ArM|rx/od:",
  cvtss2si_2 =	"rr/do:F30F2DrM|rr/qo:|rxd:|rx/qd:",
  cvttpd2dq_2 =	"rmo:660FE6rM",
  cvttps2dq_2 =	"rmo:F30F5BrM",
  cvttsd2si_2 =	"rr/do:F20F2CrM|rr/qo:|rx/dq:|rxq:",
  cvttss2si_2 =	"rr/do:F30F2CrM|rr/qo:|rxd:|rx/qd:",
  fxsave_1 =	"x.:0FAE0m",
  fxrstor_1 =	"x.:0FAE1m",
  ldmxcsr_1 =	"xd:0FAE2m",
  lfence_0 =	"0FAEE8",
  maskmovdqu_2 = "rro:660FF7rM",
  mfence_0 =	"0FAEF0",
  movapd_2 =	"rmo:660F28rM|mro:660F29Rm",
  movaps_2 =	"rmo:0F28rM|mro:0F29Rm",
  movd_2 =	"rm/od:660F6ErM|rm/oq:660F6ErXM|mr/do:660F7ERm|mr/qo:",
  movdqa_2 =	"rmo:660F6FrM|mro:660F7FRm",
  movdqu_2 =	"rmo:F30F6FrM|mro:F30F7FRm",
  movhlps_2 =	"rro:0F12rM",
  movhpd_2 =	"rx/oq:660F16rM|xr/qo:n660F17Rm",
  movhps_2 =	"rx/oq:0F16rM|xr/qo:n0F17Rm",
  movlhps_2 =	"rro:0F16rM",
  movlpd_2 =	"rx/oq:660F12rM|xr/qo:n660F13Rm",
  movlps_2 =	"rx/oq:0F12rM|xr/qo:n0F13Rm",
  movmskpd_2 =	"rr/do:660F50rM",
  movmskps_2 =	"rr/do:0F50rM",
  movntdq_2 =	"xro:660FE7Rm",
  movnti_2 =	"xrqd:0FC3Rm",
  movntpd_2 =	"xro:660F2BRm",
  movntps_2 =	"xro:0F2BRm",
  movq_2 =	"rro:F30F7ErM|rx/oq:|xr/qo:n660FD6Rm",
  movsd_2 =	"rro:F20F10rM|rx/oq:|xr/qo:nF20F11Rm",
  movss_2 =	"rro:F30F10rM|rx/od:|xr/do:F30F11Rm",
  movupd_2 =	"rmo:660F10rM|mro:660F11Rm",
  movups_2 =	"rmo:0F10rM|mro:0F11Rm",
  orpd_2 =	"rmo:660F56rM",
  orps_2 =	"rmo:0F56rM",
  pause_0 =	"F390",
  pextrw_3 =	"rri/do:660FC5rMU|xri/wo:660F3A15nRmU", -- Mem op: SSE4.1 only.
  pinsrw_3 =	"rri/od:660FC4rMU|rxi/ow:",
  pmovmskb_2 =	"rr/do:660FD7rM",
  prefetchnta_1 = "xb:n0F180m",
  prefetcht0_1 = "xb:n0F181m",
  prefetcht1_1 = "xb:n0F182m",
  prefetcht2_1 = "xb:n0F183m",
  pshufd_3 =	"rmio:660F70rMU",
  pshufhw_3 =	"rmio:F30F70rMU",
  pshuflw_3 =	"rmio:F20F70rMU",
  pslld_2 =	"rmo:660FF2rM|rio:660F726mU",
  pslldq_2 =	"rio:660F737mU",
  psllq_2 =	"rmo:660FF3rM|rio:660F736mU",
  psllw_2 =	"rmo:660FF1rM|rio:660F716mU",
  psrad_2 =	"rmo:660FE2rM|rio:660F724mU",
  psraw_2 =	"rmo:660FE1rM|rio:660F714mU",
  psrld_2 =	"rmo:660FD2rM|rio:660F722mU",
  psrldq_2 =	"rio:660F733mU",
  psrlq_2 =	"rmo:660FD3rM|rio:660F732mU",
  psrlw_2 =	"rmo:660FD1rM|rio:660F712mU",
  rcpps_2 =	"rmo:0F53rM",
  rcpss_2 =	"rro:F30F53rM|rx/od:",
  rsqrtps_2 =	"rmo:0F52rM",
  rsqrtss_2 =	"rmo:F30F52rM",
  sfence_0 =	"0FAEF8",
  shufpd_3 =	"rmio:660FC6rMU",
  shufps_3 =	"rmio:0FC6rMU",
  stmxcsr_1 =   "xd:0FAE3m",
  ucomisd_2 =	"rro:660F2ErM|rx/oq:",
  ucomiss_2 =	"rro:0F2ErM|rx/od:",
  unpckhpd_2 =	"rmo:660F15rM",
  unpckhps_2 =	"rmo:0F15rM",
  unpcklpd_2 =	"rmo:660F14rM",
  unpcklps_2 =	"rmo:0F14rM",
  xorpd_2 =	"rmo:660F57rM",
  xorps_2 =	"rmo:0F57rM",

  -- SSE3 ops
  fisttp_1 =	"xw:nDF1m|xd:DB1m|xq:nDD1m",
  addsubpd_2 =	"rmo:660FD0rM",
  addsubps_2 =	"rmo:F20FD0rM",
  haddpd_2 =	"rmo:660F7CrM",
  haddps_2 =	"rmo:F20F7CrM",
  hsubpd_2 =	"rmo:660F7DrM",
  hsubps_2 =	"rmo:F20F7DrM",
  lddqu_2 =	"rxo:F20FF0rM",
  movddup_2 =	"rmo:F20F12rM",
  movshdup_2 =	"rmo:F30F16rM",
  movsldup_2 =	"rmo:F30F12rM",

  -- SSSE3 ops
  pabsb_2 =	"rmo:660F381CrM",
  pabsd_2 =	"rmo:660F381ErM",
  pabsw_2 =	"rmo:660F381DrM",
  palignr_3 =	"rmio:660F3A0FrMU",
  phaddd_2 =	"rmo:660F3802rM",
  phaddsw_2 =	"rmo:660F3803rM",
  phaddw_2 =	"rmo:660F3801rM",
  phsubd_2 =	"rmo:660F3806rM",
  phsubsw_2 =	"rmo:660F3807rM",
  phsubw_2 =	"rmo:660F3805rM",
  pmaddubsw_2 =	"rmo:660F3804rM",
  pmulhrsw_2 =	"rmo:660F380BrM",
  pshufb_2 =	"rmo:660F3800rM",
  psignb_2 =	"rmo:660F3808rM",
  psignd_2 =	"rmo:660F380ArM",
  psignw_2 =	"rmo:660F3809rM",

  -- SSE4.1 ops
  blendpd_3 =	"rmio:660F3A0DrMU",
  blendps_3 =	"rmio:660F3A0CrMU",
  blendvpd_3 =	"rmRo:660F3815rM",
  blendvps_3 =	"rmRo:660F3814rM",
  dppd_3 =	"rmio:660F3A41rMU",
  dpps_3 =	"rmio:660F3A40rMU",
  extractps_3 =	"mri/do:660F3A17RmU|rri/qo:660F3A17RXmU",
  -- insertps is 66 0F 3A 21 /r ib (0F 3A 41 is dppd, see above).
  insertps_3 =	"rrio:660F3A21rMU|rxi/od:",
  movntdqa_2 =	"rxo:660F382ArM",
  mpsadbw_3 =	"rmio:660F3A42rMU",
  packusdw_2 =	"rmo:660F382BrM",
  pblendvb_3 =	"rmRo:660F3810rM",
  pblendw_3 =	"rmio:660F3A0ErMU",
  pcmpeqq_2 =	"rmo:660F3829rM",
  pextrb_3 =	"rri/do:660F3A14nRmU|rri/qo:|xri/bo:",
  pextrd_3 =	"mri/do:660F3A16RmU",
  pextrq_3 =	"mri/qo:660F3A16RmU",
  -- pextrw is SSE2, mem operand is SSE4.1 only
  phminposuw_2 = "rmo:660F3841rM",
  pinsrb_3 =	"rri/od:660F3A20nrMU|rxi/ob:",
  pinsrd_3 =	"rmi/od:660F3A22rMU",
  pinsrq_3 =	"rmi/oq:660F3A22rXMU",
  pmaxsb_2 =	"rmo:660F383CrM",
  pmaxsd_2 =	"rmo:660F383DrM",
  pmaxud_2 =	"rmo:660F383FrM",
  pmaxuw_2 =	"rmo:660F383ErM",
  pminsb_2 =	"rmo:660F3838rM",
  pminsd_2 =	"rmo:660F3839rM",
  pminud_2 =	"rmo:660F383BrM",
  pminuw_2 =	"rmo:660F383ArM",
  pmovsxbd_2 =	"rro:660F3821rM|rx/od:",
  pmovsxbq_2 =	"rro:660F3822rM|rx/ow:",
  pmovsxbw_2 =	"rro:660F3820rM|rx/oq:",
  pmovsxdq_2 =	"rro:660F3825rM|rx/oq:",
  pmovsxwd_2 =	"rro:660F3823rM|rx/oq:",
  pmovsxwq_2 =	"rro:660F3824rM|rx/od:",
  pmovzxbd_2 =	"rro:660F3831rM|rx/od:",
  pmovzxbq_2 =	"rro:660F3832rM|rx/ow:",
  pmovzxbw_2 =	"rro:660F3830rM|rx/oq:",
  pmovzxdq_2 =	"rro:660F3835rM|rx/oq:",
  pmovzxwd_2 =	"rro:660F3833rM|rx/oq:",
  pmovzxwq_2 =	"rro:660F3834rM|rx/od:",
  pmuldq_2 =	"rmo:660F3828rM",
  pmulld_2 =	"rmo:660F3840rM",
  ptest_2 =	"rmo:660F3817rM",
  roundpd_3 =	"rmio:660F3A09rMU",
  roundps_3 =	"rmio:660F3A08rMU",
  roundsd_3 =	"rrio:660F3A0BrMU|rxi/oq:",
  roundss_3 =	"rrio:660F3A0ArMU|rxi/od:",

  -- SSE4.2 ops
  crc32_2 =	"rmqd:F20F38F1rM|rm/dw:66F20F38F1rM|rm/db:F20F38F0rM|rm/qb:",
  pcmpestri_3 =	"rmio:660F3A61rMU",
  pcmpestrm_3 =	"rmio:660F3A60rMU",
  pcmpgtq_2 =	"rmo:660F3837rM",
  pcmpistri_3 =	"rmio:660F3A63rMU",
  pcmpistrm_3 =	"rmio:660F3A62rMU",
  popcnt_2 =	"rmqdw:F30FB8rM",

  -- SSE4a
  extrq_2 =	"rro:660F79rM",
  extrq_3 =	"riio:660F780mUU",
  insertq_2 =	"rro:F20F79rM",
  insertq_4 =	"rriio:F20F78rMUU",
  lzcnt_2 =	"rmqdw:F30FBDrM",
  movntsd_2 =	"xr/qo:nF20F2BRm",
  movntss_2 =	"xr/do:F30F2BRm",
  -- popcnt is also in SSE4.2

  -- AES-NI
  aesdec_2 =	"rmo:660F38DErM",
  aesdeclast_2 = "rmo:660F38DFrM",
  aesenc_2 =	"rmo:660F38DCrM",
  aesenclast_2 = "rmo:660F38DDrM",
  aesimc_2 =	"rmo:660F38DBrM",
  aeskeygenassist_3 = "rmio:660F3ADFrMU",
  pclmulqdq_3 =	"rmio:660F3A44rMU",

  -- AVX FP ops
  vaddsubpd_3 =	"rrmoy:660FVD0rM",
  vaddsubps_3 =	"rrmoy:F20FVD0rM",
  vandpd_3 =	"rrmoy:660FV54rM",
  vandps_3 =	"rrmoy:0FV54rM",
  vandnpd_3 =	"rrmoy:660FV55rM",
  vandnps_3 =	"rrmoy:0FV55rM",
  vblendpd_4 =	"rrmioy:660F3AV0DrMU",
  vblendps_4 =	"rrmioy:660F3AV0CrMU",
  vblendvpd_4 =	"rrmroy:660F3AV4BrMs",
  vblendvps_4 =	"rrmroy:660F3AV4ArMs",
  vbroadcastf128_2 = "rx/yo:660F38u1ArM",
  vcmppd_4 =	"rrmioy:660FVC2rMU",
  vcmpps_4 =	"rrmioy:0FVC2rMU",
  vcmpsd_4 =	"rrrio:F20FVC2rMU|rrxi/ooq:",
  vcmpss_4 =	"rrrio:F30FVC2rMU|rrxi/ood:",
  vcomisd_2 =	"rro:660Fu2FrM|rx/oq:",
  vcomiss_2 =	"rro:0Fu2FrM|rx/od:",
  vcvtdq2pd_2 =	"rro:F30FuE6rM|rx/oq:|rm/yo:",
  vcvtdq2ps_2 =	"rmoy:0Fu5BrM",
  vcvtpd2dq_2 =	"rmoy:F20FuE6rM",
  vcvtpd2ps_2 =	"rmoy:660Fu5ArM",
  vcvtps2dq_2 =	"rmoy:660Fu5BrM",
  vcvtps2pd_2 =	"rro:0Fu5ArM|rx/oq:|rm/yo:",
  vcvtsd2si_2 =	"rr/do:F20Fu2DrM|rx/dq:|rr/qo:|rxq:",
  vcvtsd2ss_3 =	"rrro:F20FV5ArM|rrx/ooq:",
  vcvtsi2sd_3 =	"rrm/ood:F20FV2ArM|rrm/ooq:F20FVX2ArM",
  vcvtsi2ss_3 =	"rrm/ood:F30FV2ArM|rrm/ooq:F30FVX2ArM",
  vcvtss2sd_3 =	"rrro:F30FV5ArM|rrx/ood:",
  vcvtss2si_2 =	"rr/do:F30Fu2DrM|rxd:|rr/qo:|rx/qd:",
  vcvttpd2dq_2 = "rmo:660FuE6rM|rm/oy:660FuLE6rM",
  vcvttps2dq_2 = "rmoy:F30Fu5BrM",
  vcvttsd2si_2 = "rr/do:F20Fu2CrM|rx/dq:|rr/qo:|rxq:",
  vcvttss2si_2 = "rr/do:F30Fu2CrM|rxd:|rr/qo:|rx/qd:",
  vdppd_4 =	"rrmio:660F3AV41rMU",
  vdpps_4 =	"rrmioy:660F3AV40rMU",
  vextractf128_3 = "mri/oy:660F3AuL19RmU",
  vextractps_3 = "mri/do:660F3Au17RmU",
  vhaddpd_3 =	"rrmoy:660FV7CrM",
  vhaddps_3 =	"rrmoy:F20FV7CrM",
  vhsubpd_3 =	"rrmoy:660FV7DrM",
  vhsubps_3 =	"rrmoy:F20FV7DrM",
  vinsertf128_4 = "rrmi/yyo:660F3AV18rMU",
  vinsertps_4 =	"rrrio:660F3AV21rMU|rrxi/ood:",
  vldmxcsr_1 =	"xd:0FuAE2m",
  vmaskmovps_3 = "rrxoy:660F38V2CrM|xrroy:660F38V2ERm",
  vmaskmovpd_3 = "rrxoy:660F38V2DrM|xrroy:660F38V2FRm",
  vmovapd_2 =	"rmoy:660Fu28rM|mroy:660Fu29Rm",
  vmovaps_2 =	"rmoy:0Fu28rM|mroy:0Fu29Rm",
  vmovd_2 =	"rm/od:660Fu6ErM|rm/oq:660FuX6ErM|mr/do:660Fu7ERm|mr/qo:",
  vmovq_2 =	"rro:F30Fu7ErM|rx/oq:|xr/qo:660FuD6Rm",
  vmovddup_2 =	"rmy:F20Fu12rM|rro:|rx/oq:",
  vmovhlps_3 =	"rrro:0FV12rM",
  vmovhpd_2 =	"xr/qo:660Fu17Rm",
  vmovhpd_3 =	"rrx/ooq:660FV16rM",
  vmovhps_2 =	"xr/qo:0Fu17Rm",
  vmovhps_3 =	"rrx/ooq:0FV16rM",
  vmovlhps_3 =	"rrro:0FV16rM",
  vmovlpd_2 =	"xr/qo:660Fu13Rm",
  vmovlpd_3 =	"rrx/ooq:660FV12rM",
  vmovlps_2 =	"xr/qo:0Fu13Rm",
  vmovlps_3 =	"rrx/ooq:0FV12rM",
  vmovmskpd_2 =	"rr/do:660Fu50rM|rr/dy:660FuL50rM",
  vmovmskps_2 =	"rr/do:0Fu50rM|rr/dy:0FuL50rM",
  vmovntpd_2 =	"xroy:660Fu2BRm",
  vmovntps_2 =	"xroy:0Fu2BRm",
  vmovsd_2 =	"rx/oq:F20Fu10rM|xr/qo:F20Fu11Rm",
  vmovsd_3 =	"rrro:F20FV10rM",
  vmovshdup_2 =	"rmoy:F30Fu16rM",
  vmovsldup_2 =	"rmoy:F30Fu12rM",
  vmovss_2 =	"rx/od:F30Fu10rM|xr/do:F30Fu11Rm",
  vmovss_3 =	"rrro:F30FV10rM",
  vmovupd_2 =	"rmoy:660Fu10rM|mroy:660Fu11Rm",
  vmovups_2 =	"rmoy:0Fu10rM|mroy:0Fu11Rm",
  vorpd_3 =	"rrmoy:660FV56rM",
  vorps_3 =	"rrmoy:0FV56rM",
  vpermilpd_3 =	"rrmoy:660F38V0DrM|rmioy:660F3Au05rMU",
  vpermilps_3 =	"rrmoy:660F38V0CrM|rmioy:660F3Au04rMU",
  vperm2f128_4 = "rrmiy:660F3AV06rMU",
  vptestpd_2 =	"rmoy:660F38u0FrM",
  vptestps_2 =	"rmoy:660F38u0ErM",
  vrcpps_2 =	"rmoy:0Fu53rM",
  vrcpss_3 =	"rrro:F30FV53rM|rrx/ood:",
  vrsqrtps_2 =	"rmoy:0Fu52rM",
  vrsqrtss_3 =	"rrro:F30FV52rM|rrx/ood:",
  vroundpd_3 =	"rmioy:660F3Au09rMU",
  vroundps_3 =	"rmioy:660F3Au08rMU",
  vroundsd_4 =	"rrrio:660F3AV0BrMU|rrxi/ooq:",
  vroundss_4 =	"rrrio:660F3AV0ArMU|rrxi/ood:",
  vshufpd_4 =	"rrmioy:660FVC6rMU",
  vshufps_4 =	"rrmioy:0FVC6rMU",
  vsqrtps_2 =	"rmoy:0Fu51rM",
  vsqrtss_2 =	"rro:F30Fu51rM|rx/od:",
  vsqrtpd_2 =	"rmoy:660Fu51rM",
  vsqrtsd_2 =	"rro:F20Fu51rM|rx/oq:",
  vstmxcsr_1 =	"xd:0FuAE3m",
  vucomisd_2 =	"rro:660Fu2ErM|rx/oq:",
  vucomiss_2 =	"rro:0Fu2ErM|rx/od:",
  vunpckhpd_3 =	"rrmoy:660FV15rM",
  vunpckhps_3 =	"rrmoy:0FV15rM",
  vunpcklpd_3 =	"rrmoy:660FV14rM",
  vunpcklps_3 =	"rrmoy:0FV14rM",
  vxorpd_3 =	"rrmoy:660FV57rM",
  vxorps_3 =	"rrmoy:0FV57rM",
  vzeroall_0 =	"0FuL77",
  vzeroupper_0 = "0Fu77",

  -- AVX2 FP ops
  vbroadcastss_2 = "rx/od:660F38u18rM|rx/yd:|rro:|rr/yo:",
  vbroadcastsd_2 = "rx/yq:660F38u19rM|rr/yo:",
  -- *vgather* (!vsib)
  vpermpd_3 =	"rmiy:660F3AuX01rMU",
  vpermps_3 =	"rrmy:660F38V16rM",

  -- AVX, AVX2 integer ops
  -- In general, xmm requires AVX, ymm requires AVX2.
  vaesdec_3 =  "rrmo:660F38VDErM",
  vaesdeclast_3 = "rrmo:660F38VDFrM",
  vaesenc_3 =  "rrmo:660F38VDCrM",
  vaesenclast_3 = "rrmo:660F38VDDrM",
  vaesimc_2 =  "rmo:660F38uDBrM",
  vaeskeygenassist_3 = "rmio:660F3AuDFrMU",
  vlddqu_2 =	"rxoy:F20FuF0rM",
  vmaskmovdqu_2 = "rro:660FuF7rM",
  vmovdqa_2 =	"rmoy:660Fu6FrM|mroy:660Fu7FRm",
  vmovdqu_2 =	"rmoy:F30Fu6FrM|mroy:F30Fu7FRm",
  vmovntdq_2 =	"xroy:660FuE7Rm",
  vmovntdqa_2 =	"rxoy:660F38u2ArM",
  vmpsadbw_4 =	"rrmioy:660F3AV42rMU",
  vpabsb_2 =	"rmoy:660F38u1CrM",
  vpabsd_2 =	"rmoy:660F38u1ErM",
  vpabsw_2 =	"rmoy:660F38u1DrM",
  vpackusdw_3 =	"rrmoy:660F38V2BrM",
  vpalignr_4 =	"rrmioy:660F3AV0FrMU",
  vpblendvb_4 =	"rrmroy:660F3AV4CrMs",
  vpblendw_4 =	"rrmioy:660F3AV0ErMU",
  vpclmulqdq_4 = "rrmio:660F3AV44rMU",
  vpcmpeqq_3 =	"rrmoy:660F38V29rM",
  vpcmpestri_3 = "rmio:660F3Au61rMU",
  vpcmpestrm_3 = "rmio:660F3Au60rMU",
  vpcmpgtq_3 =	"rrmoy:660F38V37rM",
  vpcmpistri_3 = "rmio:660F3Au63rMU",
  vpcmpistrm_3 = "rmio:660F3Au62rMU",
  vpextrb_3 =	"rri/do:660F3Au14nRmU|rri/qo:|xri/bo:",
  vpextrw_3 =	"rri/do:660FuC5rMU|xri/wo:660F3Au15nRmU",
  vpextrd_3 =	"mri/do:660F3Au16RmU",
  vpextrq_3 =	"mri/qo:660F3Au16RmU",
  vphaddw_3 =	"rrmoy:660F38V01rM",
  vphaddd_3 =	"rrmoy:660F38V02rM",
  vphaddsw_3 =	"rrmoy:660F38V03rM",
  vphminposuw_2 = "rmo:660F38u41rM",
  vphsubw_3 =	"rrmoy:660F38V05rM",
  vphsubd_3 =	"rrmoy:660F38V06rM",
  vphsubsw_3 =	"rrmoy:660F38V07rM",
  vpinsrb_4 =	"rrri/ood:660F3AV20rMU|rrxi/oob:",
  vpinsrw_4 =	"rrri/ood:660FVC4rMU|rrxi/oow:",
  vpinsrd_4 =	"rrmi/ood:660F3AV22rMU",
  vpinsrq_4 =	"rrmi/ooq:660F3AVX22rMU",
  vpmaddubsw_3 = "rrmoy:660F38V04rM",
  vpmaxsb_3 =	"rrmoy:660F38V3CrM",
  vpmaxsd_3 =	"rrmoy:660F38V3DrM",
  vpmaxuw_3 =	"rrmoy:660F38V3ErM",
  vpmaxud_3 =	"rrmoy:660F38V3FrM",
  vpminsb_3 =	"rrmoy:660F38V38rM",
  vpminsd_3 =	"rrmoy:660F38V39rM",
  vpminuw_3 =	"rrmoy:660F38V3ArM",
  vpminud_3 =	"rrmoy:660F38V3BrM",
  vpmovmskb_2 =	"rr/do:660FuD7rM|rr/dy:660FuLD7rM",
  vpmovsxbw_2 =	"rroy:660F38u20rM|rx/oq:|rx/yo:",
  vpmovsxbd_2 =	"rroy:660F38u21rM|rx/od:|rx/yq:",
  vpmovsxbq_2 =	"rroy:660F38u22rM|rx/ow:|rx/yd:",
  vpmovsxwd_2 =	"rroy:660F38u23rM|rx/oq:|rx/yo:",
  vpmovsxwq_2 =	"rroy:660F38u24rM|rx/od:|rx/yq:",
  vpmovsxdq_2 =	"rroy:660F38u25rM|rx/oq:|rx/yo:",
  vpmovzxbw_2 =	"rroy:660F38u30rM|rx/oq:|rx/yo:",
  vpmovzxbd_2 =	"rroy:660F38u31rM|rx/od:|rx/yq:",
  vpmovzxbq_2 =	"rroy:660F38u32rM|rx/ow:|rx/yd:",
  vpmovzxwd_2 =	"rroy:660F38u33rM|rx/oq:|rx/yo:",
  vpmovzxwq_2 =	"rroy:660F38u34rM|rx/od:|rx/yq:",
  vpmovzxdq_2 =	"rroy:660F38u35rM|rx/oq:|rx/yo:",
  vpmuldq_3 =	"rrmoy:660F38V28rM",
  vpmulhrsw_3 =	"rrmoy:660F38V0BrM",
  vpmulld_3 =	"rrmoy:660F38V40rM",
  vpshufb_3 =	"rrmoy:660F38V00rM",
  vpshufd_3 =	"rmioy:660Fu70rMU",
  vpshufhw_3 =	"rmioy:F30Fu70rMU",
  vpshuflw_3 =	"rmioy:F20Fu70rMU",
  vpsignb_3 =	"rrmoy:660F38V08rM",
  vpsignw_3 =	"rrmoy:660F38V09rM",
  vpsignd_3 =	"rrmoy:660F38V0ArM",
  vpslldq_3 =	"rrioy:660Fv737mU",
  vpsllw_3 =	"rrmoy:660FVF1rM|rrioy:660Fv716mU",
  vpslld_3 =	"rrmoy:660FVF2rM|rrioy:660Fv726mU",
  vpsllq_3 =	"rrmoy:660FVF3rM|rrioy:660Fv736mU",
  vpsraw_3 =	"rrmoy:660FVE1rM|rrioy:660Fv714mU",
  vpsrad_3 =	"rrmoy:660FVE2rM|rrioy:660Fv724mU",
  vpsrldq_3 =	"rrioy:660Fv733mU",
  vpsrlw_3 =	"rrmoy:660FVD1rM|rrioy:660Fv712mU",
  vpsrld_3 =	"rrmoy:660FVD2rM|rrioy:660Fv722mU",
  vpsrlq_3 =	"rrmoy:660FVD3rM|rrioy:660Fv732mU",
  vptest_2 =	"rmoy:660F38u17rM",

  -- AVX2 integer ops
  vbroadcasti128_2 = "rx/yo:660F38u5ArM",
  vinserti128_4 = "rrmi/yyo:660F3AV38rMU",
  vextracti128_3 = "mri/oy:660F3AuL39RmU",
  vpblendd_4 =	"rrmioy:660F3AV02rMU",
  vpbroadcastb_2 = "rro:660F38u78rM|rx/ob:|rr/yo:|rx/yb:",
  vpbroadcastw_2 = "rro:660F38u79rM|rx/ow:|rr/yo:|rx/yw:",
  vpbroadcastd_2 = "rro:660F38u58rM|rx/od:|rr/yo:|rx/yd:",
  vpbroadcastq_2 = "rro:660F38u59rM|rx/oq:|rr/yo:|rx/yq:",
  vpermd_3 =	"rrmy:660F38V36rM",
  vpermq_3 =	"rmiy:660F3AuX00rMU",
  -- *vpgather* (!vsib)
  vperm2i128_4 = "rrmiy:660F3AV46rMU",
  vpmaskmovd_3 = "rrxoy:660F38V8CrM|xrroy:660F38V8ERm",
  vpmaskmovq_3 = "rrxoy:660F38VX8CrM|xrroy:660F38VX8ERm",
  vpsllvd_3 =	"rrmoy:660F38V47rM",
  vpsllvq_3 =	"rrmoy:660F38VX47rM",
  vpsravd_3 =	"rrmoy:660F38V46rM",
  vpsrlvd_3 =	"rrmoy:660F38V45rM",
  vpsrlvq_3 =	"rrmoy:660F38VX45rM",

  -- Intel ADX
  adcx_2 =	"rmqd:660F38F6rM",
  adox_2 =	"rmqd:F30F38F6rM",

  -- BMI1
  andn_3 =	"rrmqd:0F38VF2rM",
  bextr_3 =	"rmrqd:0F38wF7rM",
  blsi_2 =	"rmqd:0F38vF33m",
  blsmsk_2 =	"rmqd:0F38vF32m",
  blsr_2 =	"rmqd:0F38vF31m",
  tzcnt_2 =	"rmqdw:F30FBCrM",

  -- BMI2
  bzhi_3 =	"rmrqd:0F38wF5rM",
  mulx_3 =	"rrmqd:F20F38VF6rM",
  pdep_3 =	"rrmqd:F20F38VF5rM",
  pext_3 =	"rrmqd:F30F38VF5rM",
  rorx_3 =	"rmSqd:F20F3AuF0rMS",
  sarx_3 =	"rmrqd:F30F38wF7rM",
  shrx_3 =	"rmrqd:F20F38wF7rM",
  shlx_3 =	"rmrqd:660F38wF7rM",

  -- FMA3
  vfmaddsub132pd_3 = "rrmoy:660F38VX96rM",
  vfmaddsub132ps_3 = "rrmoy:660F38V96rM",
  vfmaddsub213pd_3 = "rrmoy:660F38VXA6rM",
  vfmaddsub213ps_3 = "rrmoy:660F38VA6rM",
  vfmaddsub231pd_3 = "rrmoy:660F38VXB6rM",
  vfmaddsub231ps_3 = "rrmoy:660F38VB6rM",

  vfmsubadd132pd_3 = "rrmoy:660F38VX97rM",
  vfmsubadd132ps_3 = "rrmoy:660F38V97rM",
  vfmsubadd213pd_3 = "rrmoy:660F38VXA7rM",
  vfmsubadd213ps_3 = "rrmoy:660F38VA7rM",
  vfmsubadd231pd_3 = "rrmoy:660F38VXB7rM",
  vfmsubadd231ps_3 = "rrmoy:660F38VB7rM",

  vfmadd132pd_3 = "rrmoy:660F38VX98rM",
  vfmadd132ps_3 = "rrmoy:660F38V98rM",
  vfmadd132sd_3 = "rrro:660F38VX99rM|rrx/ooq:",
  vfmadd132ss_3 = "rrro:660F38V99rM|rrx/ood:",
  vfmadd213pd_3 = "rrmoy:660F38VXA8rM",
  vfmadd213ps_3 = "rrmoy:660F38VA8rM",
  vfmadd213sd_3 = "rrro:660F38VXA9rM|rrx/ooq:",
  vfmadd213ss_3 = "rrro:660F38VA9rM|rrx/ood:",
  vfmadd231pd_3 = "rrmoy:660F38VXB8rM",
  vfmadd231ps_3 = "rrmoy:660F38VB8rM",
  vfmadd231sd_3 = "rrro:660F38VXB9rM|rrx/ooq:",
  vfmadd231ss_3 = "rrro:660F38VB9rM|rrx/ood:",

  vfmsub132pd_3 = "rrmoy:660F38VX9ArM",
  vfmsub132ps_3 = "rrmoy:660F38V9ArM",
  vfmsub132sd_3 = "rrro:660F38VX9BrM|rrx/ooq:",
  vfmsub132ss_3 = "rrro:660F38V9BrM|rrx/ood:",
  vfmsub213pd_3 = "rrmoy:660F38VXAArM",
  vfmsub213ps_3 = "rrmoy:660F38VAArM",
  vfmsub213sd_3 = "rrro:660F38VXABrM|rrx/ooq:",
  vfmsub213ss_3 = "rrro:660F38VABrM|rrx/ood:",
  vfmsub231pd_3 = "rrmoy:660F38VXBArM",
  vfmsub231ps_3 = "rrmoy:660F38VBArM",
  vfmsub231sd_3 = "rrro:660F38VXBBrM|rrx/ooq:",
  vfmsub231ss_3 = "rrro:660F38VBBrM|rrx/ood:",

  vfnmadd132pd_3 = "rrmoy:660F38VX9CrM",
  vfnmadd132ps_3 = "rrmoy:660F38V9CrM",
  vfnmadd132sd_3 = "rrro:660F38VX9DrM|rrx/ooq:",
  vfnmadd132ss_3 = "rrro:660F38V9DrM|rrx/ood:",
  vfnmadd213pd_3 = "rrmoy:660F38VXACrM",
  vfnmadd213ps_3 = "rrmoy:660F38VACrM",
  vfnmadd213sd_3 = "rrro:660F38VXADrM|rrx/ooq:",
  vfnmadd213ss_3 = "rrro:660F38VADrM|rrx/ood:",
  vfnmadd231pd_3 = "rrmoy:660F38VXBCrM",
  vfnmadd231ps_3 = "rrmoy:660F38VBCrM",
  vfnmadd231sd_3 = "rrro:660F38VXBDrM|rrx/ooq:",
  vfnmadd231ss_3 = "rrro:660F38VBDrM|rrx/ood:",

  vfnmsub132pd_3 = "rrmoy:660F38VX9ErM",
  vfnmsub132ps_3 = "rrmoy:660F38V9ErM",
  vfnmsub132sd_3 = "rrro:660F38VX9FrM|rrx/ooq:",
  vfnmsub132ss_3 = "rrro:660F38V9FrM|rrx/ood:",
  vfnmsub213pd_3 = "rrmoy:660F38VXAErM",
  vfnmsub213ps_3 = "rrmoy:660F38VAErM",
  vfnmsub213sd_3 = "rrro:660F38VXAFrM|rrx/ooq:",
  vfnmsub213ss_3 = "rrro:660F38VAFrM|rrx/ood:",
  vfnmsub231pd_3 = "rrmoy:660F38VXBErM",
  vfnmsub231ps_3 = "rrmoy:660F38VBErM",
  vfnmsub231sd_3 = "rrro:660F38VXBFrM|rrx/ooq:",
  vfnmsub231ss_3 = "rrro:660F38VBFrM|rrx/ood:",
}
1795
1796------------------------------------------------------------------------------
1797
-- Arithmetic ops.
-- Registers the two-operand template for each of the eight ALU mnemonics.
-- The op index (0..7) is used twice: index*8 plus 1, 3 or 5 gives the opcode
-- byte of the mr, rm and accumulator-immediate forms, and the bare index is
-- the digit appended to the 81/83 opcodes of the remaining immediate forms.
do
  local alu_index = {
    add = 0, ["or"] = 1, adc = 2, sbb = 3,
    ["and"] = 4, sub = 5, xor = 6, cmp = 7,
  }
  local alu_template =
    "mr:%02XRm|rm:%02XrM|mI1qdw:81%XmI|mS1qdw:83%XmS|Ri1qdwb:%02Xri|mi1qdwb:81%Xmi"
  for mnemonic, idx in pairs(alu_index) do
    local row = shl(idx, 3)
    map_op[mnemonic.."_2"] =
      format(alu_template, row + 1, row + 3, idx, idx, row + 5, idx)
  end
end
1806
-- Shift ops.
-- Registers the two-operand template for each shift/rotate mnemonic. The op
-- index is the digit appended to the D1 (immediate 1), D3 (cl) and
-- C1 (immediate) opcodes. "sal" maps to the same index as "shl".
do
  local shift_index = {
    rol = 0, ror = 1, rcl = 2, rcr = 3,
    shl = 4, shr = 5, sar = 7, sal = 4,
  }
  local shift_template = "m1:D1%Xm|mC1qdwb:D3%Xm|mi:C1%XmU"
  for mnemonic, idx in pairs(shift_index) do
    local key = mnemonic.."_2"
    map_op[key] = format(shift_template, idx, idx, idx)
  end
end
1812
-- Conditional ops.
-- For every condition code in map_cc, registers a jump (0F 8x form), a
-- byte-sized set and a conditional move; the condition number is the low
-- hex digit of the respective opcode.
for cond, code in pairs(map_cc) do
  local jump_key = "j"..cond.."_1"
  local set_key = "set"..cond.."_1"
  local cmov_key = "cmov"..cond.."_2"
  map_op[jump_key] = format("J.:n0F8%XJ", code) -- short: 7%X
  map_op[set_key] = format("mb:n0F9%X2m", code)
  map_op[cmov_key] = format("rmqdw:0F4%XrM", code) -- P6+
end
1819
-- FP arithmetic ops.
-- Registers f<op>_1, f<op>_2 and fi<op>_1 for every op, plus the popping
-- f<op>p_1 / f<op>p_2 forms for everything except com/comp (index 2 and 3).
do
  local fp_index = {
    add = 0, mul = 1, com = 2, comp = 3,
    sub = 4, subr = 5, div = 6, divr = 7,
  }
  for suffix, idx in pairs(fp_index) do
    -- Second opcode byte of the register form: C0 + index*8.
    local fwd = 0xc0 + shl(idx, 3)
    -- Second opcode byte used by the DC/DE register forms. For index >= 4
    -- it is swapped with the neighbouring op (+8 for even, -8 for odd).
    local rev = fwd
    if idx >= 4 then
      if idx % 2 == 0 then
        rev = fwd + 8
      else
        rev = fwd - 8
      end
    end
    local op = "f"..suffix
    local is_compare = (idx == 2 or idx == 3)

    map_op[op.."_1"] = format("ff:D8%02Xr|xd:D8%Xm|xq:nDC%Xm", fwd, idx, idx)
    if is_compare then
      map_op[op.."_2"] = format("Fff:D8%02XR|Fx2d:D8%XM|Fx2q:nDC%XM", fwd, idx, idx)
    else
      map_op[op.."_2"] = format("Fff:D8%02XR|fFf:DC%02Xr|Fx2d:D8%XM|Fx2q:nDC%XM", fwd, rev, idx, idx)
      map_op[op.."p_1"] = format("ff:DE%02Xr", rev)
      map_op[op.."p_2"] = format("fFf:DE%02Xr", rev)
    end
    map_op["fi"..suffix.."_1"] = format("xd:DA%Xm|xw:nDE%Xm", idx, idx)
  end
end
1836
-- FP conditional moves.
-- The two low bits of the condition number are placed at bit 3 of the
-- 16 bit opcode 0xDAC0 and bit 2 of the condition number is moved up to
-- bit 8; the one- and two-operand forms share the resulting opcode.
do
  local fcmov_cc = { b=0, e=1, be=2, u=3, nb=4, ne=5, nbe=6, nu=7 }
  for cond, code in pairs(fcmov_cc) do
    local low = shl(band(code, 3), 3)
    local high = shl(band(code, 4), 6)
    local opc = 0xdac0 + low + high
    local mnemonic = "fcmov"..cond
    map_op[mnemonic.."_1"] = format("ff:%04Xr", opc) -- P6+
    map_op[mnemonic.."_2"] = format("Fff:%04XR", opc) -- P6+
  end
end
1843
-- SSE / AVX FP arithmetic ops.
-- Each op gets four two-operand forms (ps/ss/pd/sd). Every op except sqrt
-- (n = 1) additionally gets the matching three-operand v-prefixed forms.
do
  local sse_forms = {
    ps = "rmo:0F5%XrM",
    ss = "rro:F30F5%XrM|rx/od:",
    pd = "rmo:660F5%XrM",
    sd = "rro:F20F5%XrM|rx/oq:",
  }
  local avx_forms = {
    ps = "rrmoy:0FV5%XrM",
    ss = "rrro:F30FV5%XrM|rrx/ood:",
    pd = "rrmoy:660FV5%XrM",
    sd = "rrro:F20FV5%XrM|rrx/ooq:",
  }
  local fp_ops = {
    sqrt = 1, add = 8, mul = 9,
    sub = 12, min = 13, div = 14, max = 15,
  }
  for opname, opnum in pairs(fp_ops) do
    for suffix, fmt in pairs(sse_forms) do
      map_op[opname..suffix.."_2"] = format(fmt, opnum)
    end
    if opnum ~= 1 then
      for suffix, fmt in pairs(avx_forms) do
        map_op["v"..opname..suffix.."_3"] = format(fmt, opnum)
      end
    end
  end
end
1858
-- SSE2 / AVX / AVX2 integer arithmetic ops (66 0F leaf).
-- Each entry maps a mnemonic to its final opcode byte. Two templates are
-- registered per entry: the two-operand form and the three-operand
-- v-prefixed form.
do
  local int_ops = {
    paddb = 0xFC, paddw = 0xFD, paddd = 0xFE, paddq = 0xD4,
    paddsb = 0xEC, paddsw = 0xED, packssdw = 0x6B,
    packsswb = 0x63, packuswb = 0x67, paddusb = 0xDC,
    paddusw = 0xDD, pand = 0xDB, pandn = 0xDF, pavgb = 0xE0,
    pavgw = 0xE3, pcmpeqb = 0x74, pcmpeqd = 0x76,
    pcmpeqw = 0x75, pcmpgtb = 0x64, pcmpgtd = 0x66,
    pcmpgtw = 0x65, pmaddwd = 0xF5, pmaxsw = 0xEE,
    pmaxub = 0xDE, pminsw = 0xEA, pminub = 0xDA,
    pmulhuw = 0xE4, pmulhw = 0xE5, pmullw = 0xD5,
    pmuludq = 0xF4, por = 0xEB, psadbw = 0xF6, psubb = 0xF8,
    psubw = 0xF9, psubd = 0xFA, psubq = 0xFB, psubsb = 0xE8,
    psubsw = 0xE9, psubusb = 0xD8, psubusw = 0xD9,
    punpckhbw = 0x68, punpckhwd = 0x69, punpckhdq = 0x6A,
    punpckhqdq = 0x6D, punpcklbw = 0x60, punpcklwd = 0x61,
    punpckldq = 0x62, punpcklqdq = 0x6C, pxor = 0xEF,
  }
  for mnemonic, opbyte in pairs(int_ops) do
    local sse_template = format("rmo:660F%02XrM", opbyte)
    local avx_template = format("rrmoy:660FV%02XrM", opbyte)
    map_op[mnemonic.."_2"] = sse_template
    map_op["v"..mnemonic.."_3"] = avx_template
  end
end
1880
1881------------------------------------------------------------------------------
1882
-- Pattern characters that request VEX encoding. The value is the index of
-- the operand that dopattern() removes from the argument list and stores in
-- the vex descriptor; `false` means no operand is removed.
local map_vexarg = { u = false, v = 1, V = 2, w = 3 }

-- Process pattern string.
--
-- Walks the encoding pattern one character at a time. Hex digits are
-- accumulated into an opcode value; the other characters either modify the
-- pending opcode or cause it to be written, followed by ModRM/SIB bytes,
-- displacements or immediates, through the wput*/waction helpers.
--
--   pat      Pattern string (the part after ':' in a template alternative).
--   args     Array of parsed operands. Fields read here: reg, xreg, vreg,
--            vxreg, disp, mode, imm. NOTE: a VEX pattern character with a
--            numeric map_vexarg entry removes that operand from this array
--            in place, so later operand indexes shift down by one.
--   sz       Operand size passed on to wputop()/wputszarg(). The pattern
--            character 'n' stops it from being passed to wputop().
--   op       Opcode name; only used in error messages.
--   needrex  When truthy, 16 is added to the rex value handed to wputop()
--            (NOTE(review): the meaning of that bit is defined by wputop(),
--            which is outside this block).
--
-- Returns nothing.
local function dopattern(pat, args, sz, op, needrex)
  local digit, addin, vex
  local opcode = 0
  local szov = sz
  local narg = 1 -- Index of the next operand to consume as an immediate.
  local rex = 0

  -- Limit number of section buffer positions used by a single dasm_put().
  -- A single opcode needs a maximum of 6 positions.
  if secpos+6 > maxsecpos then wflush() end

  -- Process each character.
  -- A trailing "|" is appended so that a still pending opcode is always
  -- flushed by the generic branch below before the loop ends.
  for c in gmatch(pat.."|", ".") do
    if match(c, "%x") then	-- Hex digit.
      -- Convert '0'-'9', 'A'-'F' and 'a'-'f' to 0..15.
      digit = byte(c) - 48
      if digit > 48 then digit = digit - 39
      elseif digit > 16 then digit = digit - 7 end
      opcode = opcode*16 + digit
      addin = nil
    elseif c == "n" then	-- Disable operand size mods for opcode.
      szov = nil
    elseif c == "X" then	-- Force REX.W.
      rex = 8
    elseif c == "L" then	-- Force VEX.L.
      -- Requires that a VEX pattern character has already created `vex`.
      vex.l = true
    elseif c == "r" then	-- Merge 1st operand regno. into opcode.
      addin = args[1]; opcode = opcode + (addin.reg % 8)
      if narg < 2 then narg = 2 end
    elseif c == "R" then	-- Merge 2nd operand regno. into opcode.
      addin = args[2]; opcode = opcode + (addin.reg % 8)
      narg = 3
    elseif c == "m" or c == "M" then	-- Encode ModRM/SIB.
      -- s becomes the spare field of the ModRM byte: either the register
      -- merged by a preceding 'r'/'R' or the last hex digit of the opcode.
      local s
      if addin then
	s = addin.reg
	opcode = opcode - band(s, 7)	-- Undo regno opcode merge.
      else
	s = band(opcode, 15)	-- Undo last digit.
	opcode = shr(opcode, 4)
      end
      -- 'm' encodes the 1st operand, 'M' the 2nd operand.
      local nn = c == "m" and 1 or 2
      local t = args[nn]
      if narg <= nn then narg = nn + 1 end
      -- Collect the rex value: +8 for size "q" (unless already forced),
      -- +1/+2/+4 when the operand's reg, its xreg or the spare is above 7.
      if szov == "q" and rex == 0 then rex = rex + 8 end
      if t.reg and t.reg > 7 then rex = rex + 1 end
      if t.xreg and t.xreg > 7 then rex = rex + 2 end
      if s > 7 then rex = rex + 4 end
      if needrex then rex = rex + 16 end
      local psz, sk = wputop(szov, opcode, rex, vex, s < 0, t.vreg or t.vxreg)
      opcode = nil
      local imark = sub(pat, -1) -- Force a mark (ugly).
      -- Put ModRM/SIB with regno/last digit as spare.
      wputmrmsib(t, imark, s, addin and addin.vreg, psz, sk)
      addin = nil
    elseif map_vexarg[c] ~= nil then -- Encode using VEX prefix
      -- The accumulated opcode must end in 0F, 0F38 or 0F3A. Strip these
      -- bytes and remember which of the three was seen in m (1, 2 or 3).
      local b = band(opcode, 255); opcode = shr(opcode, 8)
      local m = 1
      if b == 0x38 then m = 2
      elseif b == 0x3a then m = 3 end
      if m ~= 1 then b = band(opcode, 255); opcode = shr(opcode, 8) end
      if b ~= 0x0f then
	werror("expected `0F', `0F38', or `0F3A' to precede `"..c..
	  "' in pattern `"..pat.."' for `"..op.."'")
      end
      -- Take the VEX operand (if any) out of the argument list.
      local v = map_vexarg[c]
      if v then v = remove(args, v) end
      -- A preceding 66/F3/F2 byte is stripped as well and recorded as p.
      b = band(opcode, 255)
      local p = 0
      if b == 0x66 then p = 1
      elseif b == 0xf3 then p = 2
      elseif b == 0xf2 then p = 3 end
      if p ~= 0 then opcode = shr(opcode, 8) end
      -- Anything still left in front is written out as plain opcode bytes.
      if opcode ~= 0 then wputop(nil, opcode, 0); opcode = 0 end
      vex = { m = m, p = p, v = v }
    else
      if opcode then -- Flush opcode.
	if szov == "q" and rex == 0 then rex = rex + 8 end
	if needrex then rex = rex + 16 end
	if addin and addin.reg == -1 then
	  -- A merged register number of -1 added 7 to the opcode
	  -- (-1 % 8 == 7). Undo that and let wvreg() handle the register.
	  local psz, sk = wputop(szov, opcode - 7, rex, vex, true)
	  wvreg("opcode", addin.vreg, psz, sk)
	else
	  if addin and addin.reg > 7 then rex = rex + 1 end
	  wputop(szov, opcode, rex, vex)
	end
	opcode = nil
      end
      if c == "|" then break end
      if c == "o" then -- Offset (pure 32 bit displacement).
	wputdarg(args[1].disp); if narg < 2 then narg = 2 end
      elseif c == "O" then
	-- Same as 'o', but takes the displacement of the 2nd operand.
	wputdarg(args[2].disp); narg = 3
      else
	-- Anything else is an immediate operand.
	local a = args[narg]
	narg = narg + 1
	local mode, imm = a.mode, a.imm
	-- Labels (mode "iJ") are only accepted by 'J' on x64 and by
	-- 'i', 'I' or 'J' otherwise.
	if mode == "iJ" and not match(x64 and "J" or "iIJ", c) then
	  werror("bad operand size for label")
	end
	if c == "S" then
	  wputsbarg(imm)
	elseif c == "U" then
	  wputbarg(imm)
	elseif c == "W" then
	  wputwarg(imm)
	elseif c == "i" or c == "I" then
	  if mode == "iJ" then
	    wputlabel("IMM_", imm, 1)
	  elseif mode == "iI" and c == "I" then
	    waction(sz == "w" and "IMM_WB" or "IMM_DB", imm)
	  else
	    wputszarg(sz, imm)
	  end
	elseif c == "J" then
	  if mode == "iPJ" then
	    waction("REL_A", imm) -- !x64 (secpos)
	  else
	    wputlabel("REL_", imm, 2)
	  end
	elseif c == "s" then
	  -- Register operand encoded in the upper 4 bits of a byte.
	  local reg = a.reg
	  if reg < 0 then
	    wputb(0)
	    wvreg("imm.hi", a.vreg)
	  else
	    wputb(shl(reg, 4))
	  end
	else
	  werror("bad char `"..c.."' in pattern `"..pat.."' for `"..op.."'")
	end
      end
    end
  end
end
2021
2022------------------------------------------------------------------------------
2023
-- Mapping of operand modes to short names. Suppress output with '#'.
local map_modename = {
  r = "reg", R = "eax", C = "cl", x = "mem", m = "mrm", i = "imm",
  f = "stx", F = "st0", J = "lbl", ["1"] = "1",
  I = "#", S = "#", O = "#",
}

-- Return a table/string showing all possible operand modes.
-- For zero-operand opcodes the empty string is returned. Otherwise the
-- result is an array with one "name, name, ..." entry per '|'-separated
-- alternative of the template, built from its first nparams mode
-- characters. Alternatives containing a suppressed mode ('#') are left out.
local function templatehelp(template, nparams)
  if nparams == 0 then return "" end
  local lines = {}
  for alt in gmatch(template, "[^%|]+") do
    local text = map_modename[sub(alt, 1, 1)]
    for modechar in gmatch(sub(alt, 2, nparams), ".") do
      text = text..", "..map_modename[modechar]
    end
    if not match(text, "#") then
      lines[#lines+1] = text
    end
  end
  return lines
end
2044
-- Match operand modes against mode match part of template.
-- The i-th character of tm is used as a pattern that must be found in the
-- mode string of the i-th operand. Returns true if all operands match and
-- nothing otherwise.
local function matchtm(tm, args)
  for pos, operand in ipairs(args) do
    local wanted = sub(tm, pos, pos)
    if match(operand.mode, wanted) == nil then
      return
    end
  end
  return true
end
2052
-- Handle opcodes defined with template strings.
--
-- A template is a list of alternatives separated by '|'. Each alternative
-- has the form "<modes><sizes>:<pattern>": one mode character per operand,
-- an optional size match and the encoding pattern for dopattern(). The
-- first alternative that matches the operand modes and sizes is encoded.
--
--   params    Array of operand strings plus the field `op` (opcode name).
--             When nil, the help produced by templatehelp() is returned.
--   template  Template string for this opcode.
--   nparams   Number of operands; only used for the help output.
--
-- Raises an error via werror() if no alternative matches.
map_op[".template__"] = function(params, template, nparams)
  if not params then return templatehelp(template, nparams) end
  local args = {}

  -- Zero-operand opcodes have no match part.
  -- The whole template is the pattern and the operand size is fixed to "d".
  if #params == 0 then
    dopattern(template, args, "d", params.op, nil)
    return
  end

  -- Determine common operand size (coerce undefined size) or flag as mixed.
  -- needrex is taken from the first operand that defines it; every later
  -- operand that also defines it must agree.
  local sz, szmix, needrex
  for i,p in ipairs(params) do
    args[i] = parseoperand(p)
    local nsz = args[i].opsize
    if nsz then
      if sz and sz ~= nsz then szmix = true else sz = nsz end
    end
    local nrex = args[i].needrex
    if nrex ~= nil then
      if needrex == nil then
	needrex = nrex
      elseif needrex ~= nrex then
	werror("bad mix of byte-addressable registers")
      end
    end
  end

  -- Try all match:pattern pairs (separated by '|').
  local gotmatch, lastpat
  for tm in gmatch(template, "[^%|]+") do
    -- Split off size match (starts after mode match) and pattern string.
    local szm, pat = match(tm, "^(.-):(.*)$", #args+1)
    -- An empty pattern reuses the pattern of the previous alternative.
    if pat == "" then pat = lastpat else lastpat = pat end
    if matchtm(tm, args) then
      local prefix = sub(szm, 1, 1)
      if prefix == "/" then -- Exactly match leading operand sizes.
	-- Character i of szm (i >= 2) must equal the size of operand i-1.
	-- The check runs from the last character down; reaching i == 1
	-- means that every listed size matched.
	for i = #szm,1,-1 do
	  if i == 1 then
	    dopattern(pat, args, sz, params.op, needrex) -- Process pattern.
	    return
	  elseif args[i-1].opsize ~= sub(szm, i, i) then
	    break
	  end
	end
      else -- Match common operand size.
	local szp = sz
	if szm == "" then szm = x64 and "qdwb" or "dwb" end -- Default sizes.
	-- A "1" or "2" prefix takes the size from that operand alone and
	-- clears the mixed-size flag. szmix is declared outside the loop,
	-- so it stays cleared for later alternatives and for the error
	-- message below.
	if prefix == "1" then szp = args[1].opsize; szmix = nil
	elseif prefix == "2" then szp = args[2].opsize; szmix = nil end
	-- A "." prefix accepts any size. Otherwise the size must occur in
	-- szm; if no size is known, "#" is searched for instead.
	if not szmix and (prefix == "." or match(szm, szp or "#")) then
	  dopattern(pat, args, szp, params.op, needrex) -- Process pattern.
	  return
	end
      end
      -- The modes matched, but the sizes did not.
      gotmatch = true
    end
  end

  -- No alternative was usable: pick the most specific error message.
  local msg = "bad operand mode"
  if gotmatch then
    if szmix then
      msg = "mixed operand size"
    else
      msg = sz and "bad operand size" or "missing operand size"
    end
  end

  werror(msg.." in `"..opmodestr(params.op, args).."'")
end
2124
2125------------------------------------------------------------------------------
2126
-- x64-specific opcode for 64 bit immediates and displacements.
if x64 then
  -- mov64 reg, imm | mov64 reg, [disp] | mov64 [disp], reg
  --
  -- An operand written as "[...]" is treated as a 64 bit address expression:
  --   [disp], reg  -> opcode A3, register operand must have mode "rmR"
  --   reg, [disp]  -> opcode A1, register operand must have mode "rmR"
  -- Otherwise the second operand is a 64 bit immediate:
  --   reg, imm     -> opcode B8 + (reg mod 8), reg must be a 64 bit register
  -- The 64 bit expression is emitted as two IMM_D actions (low 32 bits
  -- first, then the expression shifted right by 32).
  --
  -- Called without params it returns the list of accepted operand forms.
  function map_op.mov64_2(params)
    if not params then return { "reg, imm", "reg, [disp]", "[disp], reg" } end
    if secpos+2 > maxsecpos then wflush() end

    local memref = "^%[%s*(.-)%s*%]$"
    local opcode, sz, rex, vreg

    -- Look for the bracketed operand: first operand wins, else try the second.
    local op64 = match(params[1], memref)
    local is_store = op64 ~= nil
    if not is_store then op64 = match(params[2], memref) end

    -- The other operand (or the first one for the immediate form) is parsed
    -- as a regular operand.
    local a = parseoperand(params[is_store and 2 or 1])

    if op64 then
      -- Address forms.
      if a.mode ~= "rmR" then werror("bad operand mode") end
      sz = a.opsize
      rex = sz == "q" and 8 or 0
      opcode = is_store and 0xa3 or 0xa1
    else
      -- Immediate form.
      if sub(a.mode, 1, 1) ~= "r" or a.opsize ~= "q" then
        werror("bad operand mode")
      end
      op64 = params[2]
      if a.reg == -1 then
        -- Register number -1: the register is supplied through wvreg() below.
        vreg = a.vreg
        opcode = 0xb8
      else
        opcode = 0xb8 + band(a.reg, 7)
      end
      rex = a.reg > 7 and 9 or 8
    end

    local psz, sk = wputop(sz, opcode, rex, nil, vreg)
    wvreg("opcode", vreg, psz, sk)
    waction("IMM_D", format("(unsigned int)(%s)", op64))
    waction("IMM_D", format("(unsigned int)((%s)>>32)", op64))
  end
end
2168
2169------------------------------------------------------------------------------
2170
-- Pseudo-opcodes for data storage.
-- The element size is taken from the second character of the directive
-- name: "l" is treated as "d" and "a" as the current addrsize. Every
-- operand must be an immediate whose explicit size (if any) equals the
-- element size. Labels are emitted via wputlabel(), "q" sized values via
-- wputqarg() and everything else via wputszarg().
local function op_data(params)
  if not params then return "imm..." end

  local size = sub(params.op, 2, 2)
  if size == "l" then
    size = "d"
  elseif size == "a" then
    size = addrsize
  end

  for _, operand in ipairs(params) do
    local parsed = parseoperand(operand, size == "q")
    local is_imm = sub(parsed.mode, 1, 1) == "i"
    if not is_imm or (parsed.opsize and parsed.opsize ~= size) then
      werror("bad mode or size in `"..operand.."'")
    end
    if parsed.mode == "iJ" then
      wputlabel("IMM_", parsed.imm, 1)
    elseif size == "q" then
      wputqarg(parsed.imm)
    else
      wputszarg(size, parsed.imm)
    end
    -- Flush between operands when the section buffer is about to run out
    -- of positions.
    if secpos+2 > maxsecpos then wflush() end
  end
end

-- All data directives share the same handler.
for _, directive in ipairs{
  ".byte_*", ".sbyte_*", ".word_*", ".dword_*", ".qword_*",
  ".aword_*", ".long_*", ".quad_*", ".addr_*",
} do
  map_op[directive] = op_data
end
2201
2202------------------------------------------------------------------------------
2203
-- Pseudo-opcode to mark the position where the action list is to be emitted.
-- The actual output is deferred: a writer function is queued via wline().
map_op[".actionlist_1"] = function(params)
  if params then
    local cvar = params[1] -- No syntax check. You get to keep the pieces.
    wline(function(out) writeactions(out, cvar) end)
  else
    return "cvar"
  end
end

-- Pseudo-opcode to mark the position where the global enum is to be emitted.
map_op[".globals_1"] = function(params)
  if params then
    local enumprefix = params[1] -- No syntax check. You get to keep the pieces.
    wline(function(out) writeglobals(out, enumprefix) end)
  else
    return "prefix"
  end
end

-- Pseudo-opcode to mark the position where the global names are to be emitted.
map_op[".globalnames_1"] = function(params)
  if params then
    local cvar = params[1] -- No syntax check. You get to keep the pieces.
    wline(function(out) writeglobalnames(out, cvar) end)
  else
    return "cvar"
  end
end

-- Pseudo-opcode to mark the position where the extern names are to be emitted.
map_op[".externnames_1"] = function(params)
  if params then
    local cvar = params[1] -- No syntax check. You get to keep the pieces.
    wline(function(out) writeexternnames(out, cvar) end)
  else
    return "cvar"
  end
end
2231
2232------------------------------------------------------------------------------
2233
-- Label pseudo-opcode (converted from trailing colon form).
-- The first operand defines the label. A numeric value that is either a
-- label operand or lies in the range 1..9 is emitted as LABEL_LG; any other
-- label operand is emitted as LABEL_PC. An optional second operand assigns
-- an address to the label via SETLABEL.
map_op[".label_2"] = function(params)
  if not params then return "[1-9] | ->global | =>pcexpr  [, addr]" end
  if secpos+2 > maxsecpos then wflush() end

  local label = parseoperand(params[1])
  local mode, imm = label.mode, label.imm
  local is_label = (mode == "iJ")
  local is_number = (type(imm) == "number")

  if is_number and (is_label or (imm >= 1 and imm <= 9)) then
    -- Local label (1: ... 9:) or global label (->global:).
    waction("LABEL_LG", nil, 1)
    wputxb(imm)
  elseif is_label then
    -- PC label (=>pcexpr:).
    waction("LABEL_PC", imm)
  else
    werror("bad label definition")
  end

  -- SETLABEL must immediately follow LABEL_LG/LABEL_PC.
  local addr = params[2]
  if addr then
    local target = parseoperand(addr)
    if target.mode ~= "iPJ" then
      werror("bad label assignment")
    else
      waction("SETLABEL", target.imm)
    end
  end
end
map_op[".label_1"] = map_op[".label_2"]
2262
2263------------------------------------------------------------------------------
2264
-- Alignment pseudo-opcode.
-- The operand is either a number or an operand size name that is looked up
-- through map_opsize/map_opsizenum. Only the powers of two from 2 to 256
-- are accepted; everything else raises "bad alignment".
map_op[".align_1"] = function(params)
  if not params then return "numpow2" end
  if secpos+1 > maxsecpos then wflush() end

  local spec = params[1]
  local align = tonumber(spec) or map_opsizenum[map_opsize[spec]]
  if align then
    -- Must be a power of 2 in the range (2 ... 256).
    local pow2 = 1
    for _ = 1, 8 do
      pow2 = pow2 * 2
      if align == pow2 then
        waction("ALIGN", nil, 1)
        wputxb(align-1) -- Action byte is 2**n-1.
        return
      end
    end
  end
  werror("bad alignment")
end
2284
-- Spacing pseudo-opcode.
-- Emits a SPACE action for the given size expression, followed by the
-- filler byte. The filler defaults to 0 and must be a number in 0..255.
map_op[".space_2"] = function(params)
  if not params then return "num [, filler]" end
  if secpos+1 > maxsecpos then wflush() end

  waction("SPACE", params[1])

  local filler = 0
  local spec = params[2]
  if spec then
    local value = tonumber(spec)
    if not value or value < 0 or value > 255 then werror("bad filler") end
    filler = value or 0
  end
  wputxb(filler)
end
map_op[".space_1"] = map_op[".space_2"]
2298
2299------------------------------------------------------------------------------
2300
-- Pseudo-opcode for (primitive) type definitions (map to C types).
-- Registers a new type name with its C type and an optional base register,
-- adds a "#name" define that expands to sizeof(ctype) and emits a DtN()
-- shortcut macro for computing field offsets.
map_op[".type_3"] = function(params, nparams)
  if not params then
    if nparams == 2 then return "name, ctype" end
    return "name, ctype, reg"
  end

  local name, ctype, reg = params[1], params[2], params[3]

  -- Validate the arguments.
  if not match(name, "^[%a_][%w_]*$") then
    werror("bad type name `"..name.."'")
  end
  if map_type[name] then
    werror("duplicate type `"..name.."'")
  end
  if reg and not map_reg_valid_base[reg] then
    werror("bad base register `"..(map_reg_rev[reg] or reg).."'")
  end

  -- Add #type to defines. A bit unclean to put it in map_archdef.
  map_archdef["#"..name] = "sizeof("..ctype..")"

  -- Add new type and emit shortcut define.
  local typeidx = ctypenum + 1
  local entry = {
    ctype = ctype,
    ctypefmt = format("Dt%X(%%s)", typeidx),
    reg = reg,
  }
  map_type[name] = entry
  wline(format("#define Dt%X(_V) (int)(ptrdiff_t)&(((%s *)0)_V)", typeidx, ctype))
  -- The type counter is only advanced after the define has been emitted.
  ctypenum = typeidx
end
map_op[".type_2"] = map_op[".type_3"]
2330
-- Dump type definitions.
-- Writes one line per defined type (name, C type, base register name),
-- sorted by type name. The lvl argument is accepted but not used here.
local function dumptypes(out, lvl)
  local names = {}
  for typename in pairs(map_type) do
    names[#names+1] = typename
  end
  sort(names)

  out:write("Type definitions:\n")
  for i = 1, #names do
    local typename = names[i]
    local info = map_type[typename]
    local regname = ""
    if info.reg then
      regname = map_reg_rev[info.reg] or ""
    end
    out:write(format("  %-20s %-20s %s\n", typename, info.ctype, regname))
  end
  out:write("\n")
end
2344
2345------------------------------------------------------------------------------
2346
-- Set the current section.
-- Emits a SECTION action, passes the section number to wputxb() and then
-- flushes the pending output with the `true` flag set.
--   num  Section number.
function _M.section(num)
  waction("SECTION")
  wputxb(num)
  wflush(true) -- SECTION is a terminal action.
end
2353
2354------------------------------------------------------------------------------
2355
-- Dump architecture description.
-- Writes a banner with the arch name, version and release date taken from
-- _info, followed by the register and action dumps.
function _M.dumparch(out)
  local banner = format("DynASM %s version %s, released %s\n\n",
    _info.arch, _info.version, _info.release)
  out:write(banner)
  dumpregs(out)
  dumpactions(out)
end
2363
-- Dump all user defined elements.
-- Writes the type, global and extern dumps to `out`, in that order.
--   out  Output object passed on to the dump functions.
--   lvl  Passed through unchanged to each dump function.
function _M.dumpdef(out, lvl)
  dumptypes(out, lvl)
  dumpglobals(out, lvl)
  dumpexterns(out, lvl)
end
2370
2371------------------------------------------------------------------------------
2372
-- Pass callbacks from/to the DynASM core.
-- Stores the four callbacks handed in by the core (line writer, error,
-- fatal error and warning) and hands back this module's flush function.
function _M.passcb(wl, we, wf, ww)
  wline = wl
  werror = we
  wfatal = wf
  wwarn = ww
  return wflush
end
2378
-- Setup the arch-specific module.
-- Remembers the architecture name and the option table for later use.
function _M.setup(arch, opt)
  g_arch = arch
  g_opt = opt
end
2383
-- Merge the core maps and the arch-specific maps.
-- Lookups that miss in map_op fall back to the core opcode map and lookups
-- that miss in the core define map fall back to map_archdef. Returns the
-- two merged maps.
function _M.mergemaps(map_coreop, map_def)
  local op_fallback = { __index = map_coreop }
  setmetatable(map_op, op_fallback)
  local def_fallback = { __index = map_archdef }
  setmetatable(map_def, def_fallback)
  return map_op, map_def
end
2390
2391return _M
2392
2393------------------------------------------------------------------------------
2394
2395