/**
 * Scanner that tokenizes Ethereum Virtual Machine (EVM) assembly language.
 *
 * Copyright © 2016, Eric Scrivner
 *
 * License: Subject to the terms of the MIT license, as written in the included
 * LICENSE.txt file.
 * Authors: Eric Scrivner
 */
module phlogiston.assembler.scanner;

import std.ascii;
import std.bigint;
import std.conv;
import std.exception;
import std.range;
import std.string;

import phlogiston.assembler.token;
import phlogiston.evm.opcodes;

/**
 * This predicate indicates whether or not the given character is a newline
 * character. Only the line feed character counts; a carriage return is not
 * treated as a newline.
 *
 * Params:
 *   c = The character to test
 *
 * Returns: true if the character is a newline, false otherwise.
 */
pure nothrow @nogc @safe bool isNewline(immutable dchar c) {
    return c == '\n';
}

///
unittest {
    assert(isNewline('\n'));
    assert(!isNewline('a'));
    assert(!isNewline(' '));
}

/**
 * Exception raised when the scanner cannot turn the input into a token: an
 * unexpected character, an unknown opcode name, or a malformed hexadecimal
 * number.
 */
class InvalidTokenException : Exception {
    @safe pure nothrow this(string msg,
                            string file = __FILE__,
                            size_t line = __LINE__,
                            Throwable next = null)
    {
        super(msg, file, line, next);
    }
}

/// Scans the input stream provided into a series of tokens.
class Scanner {
    /// The input stream of characters.
    private ubyte[] m_charStream;
    /// The current line number in the input range.
    private size_t m_lineNumber;
    /// The current column number in the input range.
    private size_t m_columnNumber;
    /// The current token from the input range.
    private Token m_currentToken;
    /// Associative array of opcode names to bytecode values
    private ubyte[string] m_validOpcodeNames;

    /**
     * Creates a scanner positioned at line 1, column 1 of the given input.
     *
     * Params:
     *   charStream = The bytes of the assembly source to tokenize.
     */
    this(ubyte[] charStream) {
        this.m_charStream = charStream;
        this.m_lineNumber = 1;
        this.m_columnNumber = 1;
        this.m_currentToken = null;
        this.m_validOpcodeNames = generateOpcodeNameToBytecodeMap();
    }

    /**
     * Returns: The current line number.
     */
    public @property const size_t lineNumber() {
        return m_lineNumber;
    }

    ///
    unittest {
        auto scanner = new Scanner(cast(ubyte[])"\n\n\r\n".representation);
        assert(scanner.lineNumber == 1);
    }

    /**
     * Returns: The current column number.
     */
    public @property const size_t columnNumber() {
        return m_columnNumber;
    }

    ///
    unittest {
        auto scanner = new Scanner(cast(ubyte[])"abcd ".representation);
        assert(scanner.columnNumber == 1);
    }

    /**
     * Returns: The most recent token from the input stream, or null if no
     * tokens have yet been retrieved.
     */
    public @property Token currentToken() {
        return m_currentToken;
    }

    ///
    unittest {
        ubyte[] fixture = cast(ubyte[])"PUSH 1".representation;
        auto scanner = new Scanner(fixture);
        assert(scanner.currentToken is null);
    }

    /**
     * This routine consumes the input stream, returning the next token found.
     * If a token could not be found, then an error is raised.
     *
     * A run of consecutive whitespace characters is reported as a single
     * Whitespace token, and an exhausted input yields EndOfStream.
     *
     * Throws: InvalidTokenException if an invalid token is encountered.
     *
     * Returns: Next token in the input character stream.
     */
    public Token nextToken() {
        if (m_charStream.empty) {
            m_currentToken = new EndOfStream;
        } else if (isWhite(m_charStream.front)) {
            skipWhitespace();
            m_currentToken = new Whitespace;
        } else if (isAlpha(m_charStream.front)) {
            m_currentToken = parseOpcode();
        } else if (isDigit(m_charStream.front)) {
            m_currentToken = parseNumber();
        } else {
            throw new InvalidTokenException(
                format("Invalid token '%c' (Line %d, Column %d)",
                       cast(dchar)m_charStream.front,
                       m_lineNumber,
                       m_columnNumber));
        }

        return m_currentToken;
    }

    /**
     * This routine skips ahead in the input range until the first
     * non-whitespace character or the end of the range is encountered.
     *
     * Each newline advances the line number and resets the column to 1; any
     * other whitespace character advances the column by one.
     */
    private void skipWhitespace() {
        while (!m_charStream.empty && isWhite(m_charStream.front)) {
            if (isNewline(m_charStream.front)) {
                m_lineNumber += 1;
                m_columnNumber = 1;
            } else {
                m_columnNumber += 1;
            }

            m_charStream.popFront();
        }
    }

    /**
     * This routine parses a number from the input range. Input beginning
     * with the prefix "0x" is parsed as hexadecimal, anything else as
     * decimal.
     *
     * Throws: InvalidTokenException if a hexadecimal number is malformed.
     *
     * Returns: The token for the parsed number.
     */
    private Token parseNumber() {
        // Only the two prefix characters are required here. A bare "0x" with
        // no digits after it is handed to the hex parser so that it is
        // rejected as an invalid hexadecimal number, rather than being split
        // into the decimal number 0 followed by a stray 'x'.
        if (m_charStream.length >= 2 && m_charStream[0..2] == "0x") {
            return parseHexNumber();
        }

        return parseDecimalNumber();
    }

    /**
     * This routine parses a hexadecimal number from the input range.
     *
     * Every hexadecimal digit and every 'x' is consumed; whether the
     * collected text is a well-formed number is decided by the BigInt
     * conversion.
     *
     * Throws: InvalidTokenException if an invalid token is encountered.
     *
     * Returns: The token from the parsed input stream.
     */
    private Token parseHexNumber() {
        ubyte[] hexNumber;

        while (!m_charStream.empty &&
               (isHexDigit(m_charStream.front) || m_charStream.front == 'x')) {
            hexNumber ~= m_charStream.front;

            m_columnNumber += 1;
            m_charStream.popFront();
        }

        try {
            return new Number(BigInt(hexNumber.assumeUTF));
        } catch(ConvException ce) {
            throw new InvalidTokenException(
                format("Invalid hexadecimal number '%s' (Line %d, Column %d)",
                       cast(string)hexNumber,
                       lineNumber,
                       columnNumber));
        }
    }

    /**
     * This routine parses a decimal number from the input range.
     *
     * Returns: The parsed number from the input range.
     */
    private Token parseDecimalNumber() {
        ubyte[] number;

        while (!m_charStream.empty && isDigit(m_charStream.front)) {
            number ~= m_charStream.front;

            m_columnNumber += 1;
            m_charStream.popFront();
        }

        return new Number(BigInt(number.assumeUTF));
    }

    /**
     * This routine parses an opcode from the input range. Names beginning
     * with "PUSH" become push opcodes; every other name becomes a stack
     * opcode.
     *
     * Throws: InvalidTokenException if the opcode name is not recognized.
     *
     * Returns: The token for the parsed opcode.
     */
    private Token parseOpcode() {
        if (m_charStream.length >= 4 && m_charStream[0..4] == "PUSH") {
            return parsePushOpcode();
        }

        return parseStackOpcode();
    }

    /**
     * This routine parses a push opcode from the input range.
     *
     * Throws: InvalidTokenException if an invalid token is encountered.
     *
     * Returns: Token for the push opcode.
     */
    private Token parsePushOpcode() {
        ubyte[] opcode = consumeOpcodeName();

        // Ensure the opcode is a valid push opcode
        enforceValidOpcode(opcode);

        return new PushOpcode(opcode.assumeUTF);
    }

    /**
     * This routine parses a stack opcode from the input range.
     *
     * Throws: InvalidTokenException if an invalid token is encountered.
     *
     * Returns: Token for the stack opcode.
     */
    private Token parseStackOpcode() {
        ubyte[] opcode = consumeOpcodeName();

        // Ensure the opcode is a valid stack opcode
        enforceValidOpcode(opcode);

        return new StackOpcode(opcode.assumeUTF);
    }

    /**
     * Consumes the run of ASCII letters and digits at the front of the input
     * range, advancing the column number once per consumed character.
     *
     * Returns: The consumed characters.
     */
    private ubyte[] consumeOpcodeName() {
        ubyte[] opcode;

        while (!m_charStream.empty &&
               (isAlpha(m_charStream.front) || isDigit(m_charStream.front))) {
            opcode ~= m_charStream.front;

            m_columnNumber += 1;
            m_charStream.popFront();
        }

        return opcode;
    }

    /**
     * Verifies that the given name is a key of the opcode name table.
     *
     * Params:
     *   opcode = The scanned opcode name.
     *
     * Throws: InvalidTokenException if the name is not a known opcode.
     */
    private void enforceValidOpcode(ubyte[] opcode) {
        if (cast(string)opcode !in m_validOpcodeNames) {
            throw new InvalidTokenException(
                format("Expected valid opcode, found '%s' (Line %d, Column %d)",
                       cast(string)opcode,
                       lineNumber,
                       columnNumber));
        }
    }
}

///
unittest {
    auto scanner = new Scanner(cast(ubyte[])"".representation);
    assert(cast(EndOfStream)scanner.nextToken());

    // Three newlines move the scanner to line 4 (the carriage return is
    // ordinary whitespace); the two trailing spaces then advance the column
    // from 1 to 3.
    scanner = new Scanner(cast(ubyte[])"\n\n\r\n  ".representation);
    assert(cast(Whitespace)scanner.nextToken());
    assert(scanner.lineNumber == 4);
    assert(scanner.columnNumber == 3);

    scanner = new Scanner(cast(ubyte[])"STOP".representation);
    StackOpcode stackToken = cast(StackOpcode)scanner.nextToken();
    assert(stackToken.m_opcode == "STOP");

    scanner = new Scanner(cast(ubyte[])"PUSH1 0xa".representation);
    PushOpcode pushToken = cast(PushOpcode)scanner.nextToken();
    assert(pushToken.m_opcode == "PUSH1");
    assert(cast(Whitespace)scanner.nextToken());
    Number number = cast(Number)scanner.nextToken();
    assert(number.m_value == BigInt(10));

    scanner = new Scanner(cast(ubyte[])"PUSH1 1234".representation);
    pushToken = cast(PushOpcode)scanner.nextToken();
    assert(pushToken.m_opcode == "PUSH1");
    assert(cast(Whitespace)scanner.nextToken());
    number = cast(Number)scanner.nextToken();
    assert(number.m_value == BigInt("1234"));
}

unittest {
    // Invalid hex number
    auto scanner = new Scanner(cast(ubyte[])"PUSH1 0x\nDUP1".representation);
    scanner.nextToken();
    scanner.nextToken();
    assertThrown!InvalidTokenException(scanner.nextToken());

    // Hex prefix with no digits at the very end of the input
    scanner = new Scanner(cast(ubyte[])"0x".representation);
    assertThrown!InvalidTokenException(scanner.nextToken());

    // Invalid push opcode
    scanner = new Scanner(cast(ubyte[])"PUSH36 0xa".representation);
    assertThrown!InvalidTokenException(scanner.nextToken());

    // Invalid character in stream
    scanner = new Scanner(cast(ubyte[])"PUSH1-0xfa".representation);
    scanner.nextToken();
    assertThrown!InvalidTokenException(scanner.nextToken());
}