1 /**
2  * Scanner that tokenizes Ethereum Virtual Machine (EVM) assembly language.
3  *
4  * Copyright © 2016, Eric Scrivner
5  *
6  * License: Subject to the terms of the MIT license, as written in the included
7  * LICENSE.txt file.
8  * Authors: Eric Scrivner
9  */
10 module phlogiston.assembler.scanner;
11 
import std.ascii;
import std.bigint;
import std.conv;
import std.exception;
import std.range;
import std.string;
18 
19 import phlogiston.assembler.token;
20 import phlogiston.evm.opcodes;
21 
/**
 * Tests whether a character is the line-feed character.
 *
 * Only `'\n'` is treated as a newline; every other character, including the
 * carriage return `'\r'`, yields false.
 *
 * Params:
 *     c = The character to test
 *
 * Returns: true if `c` is `'\n'`, false otherwise.
 */
pure nothrow @nogc @safe bool isNewline(immutable dchar c) {
    immutable bool isLineFeed = ('\n' == c);
    return isLineFeed;
}
34 
///
unittest {
    // Ordinary characters and other whitespace are not newlines...
    assert(!isNewline(' '));
    assert(!isNewline('a'));
    // ...only the line feed is.
    assert(isNewline('\n'));
}
41 
/// Exception raised when an invalid character is encountered in input range.
class InvalidTokenException : Exception {
    /**
     * Forwards all arguments unchanged to the `Exception` constructor.
     *
     * Params:
     *     msg  = Human-readable description of the problem
     *     file = Source file reported for the throw site (defaults to the
     *            caller's file)
     *     line = Source line reported for the throw site (defaults to the
     *            caller's line)
     *     next = Optional chained throwable
     */
    @safe pure nothrow
    this(string msg, string file = __FILE__, size_t line = __LINE__, Throwable next = null) {
        super(msg, file, line, next);
    }
}
52 
/**
 * Scans the input stream provided into a series of tokens.
 *
 * The scanner consumes bytes from the front of the stream it was given and
 * tracks the line and column of the next unread byte. Each call to
 * nextToken() produces one token: end of stream, a run of whitespace, an
 * opcode name, or a decimal or "0x"-prefixed hexadecimal number.
 */
class Scanner {
    /// The input stream of characters.
    private ubyte[] m_charStream;
    /// The current line number in the input range.
    private size_t m_lineNumber;
    /// The current column number in the input range.
    private size_t m_columnNumber;
    /// The current token from the input range.
    private Token m_currentToken;
    /// Associative array of opcode names to bytecode values
    private ubyte[string] m_validOpcodeNames;

    /**
     * Creates a scanner positioned at line 1, column 1 of the given stream,
     * with no current token.
     *
     * Params:
     *     charStream = The bytes to tokenize
     */
    this(ubyte[] charStream) {
        this.m_charStream = charStream;
        this.m_lineNumber = 1;
        this.m_columnNumber = 1;
        this.m_currentToken = null;
        this.m_validOpcodeNames = generateOpcodeNameToBytecodeMap();
    }

    /**
     * Returns: The current line number.
     */
    public @property const size_t lineNumber() {
        return m_lineNumber;
    }

    ///
    unittest {
        auto scanner = new Scanner(cast(ubyte[])"\n\n\r\n".representation);
        assert(scanner.lineNumber == 1);
    }

    /**
     * Returns: The current column number.
     */
    public @property const size_t columnNumber() {
        return m_columnNumber;
    }

    ///
    unittest {
        auto scanner = new Scanner(cast(ubyte[])"abcd ".representation);
        assert(scanner.columnNumber == 1);
    }

    /**
     * Returns: The most recent token from the input stream, or null if no
     * tokens have yet been retrieved.
     */
    public @property Token currentToken() {
        return m_currentToken;
    }

    ///
    unittest {
        ubyte[] fixture = cast(ubyte[])"PUSH 1".representation;
        auto scanner = new Scanner(fixture);
        assert(scanner.currentToken is null);
    }

    /**
     * This routine consumes the input stream, returning the next token found.
     * If a token could not be found, then an error is raised.
     *
     * The kind of token is chosen from the first unread byte: whitespace
     * starts a whitespace run, a letter starts an opcode and a digit starts a
     * number. The returned token is also stored as the current token.
     *
     * Throws: InvalidTokenException if an invalid token is encountered.
     *
     * Returns: Next token in the input character stream.
     */
    public Token nextToken() {
        if (m_charStream.empty) {
            m_currentToken = new EndOfStream;
        } else if (isWhite(m_charStream.front)) {
            skipWhitespace();
            m_currentToken = new Whitespace;
        } else if (isAlpha(m_charStream.front)) {
            m_currentToken = parseOpcode();
        } else if (isDigit(m_charStream.front)) {
            m_currentToken = parseNumber();
        } else {
            throw new InvalidTokenException(
                format("Invalid token '%c' (Line %d, Column %d)",
                       cast(dchar)m_charStream.front,
                       m_lineNumber,
                       m_columnNumber));
        }

        return m_currentToken;
    }

    /**
     * This routine skips ahead in the input range until the first
     * non-whitespace character or the end of the range is encountered.
     *
     * A newline advances the line number and resets the column to 1; any
     * other whitespace character advances the column by one.
     */
    private void skipWhitespace() {
        while (!m_charStream.empty && isWhite(m_charStream.front)) {
            if (isNewline(m_charStream.front)) {
                m_lineNumber += 1;
                m_columnNumber = 1;
            } else {
                m_columnNumber += 1;
            }
            m_charStream.popFront();
        }
    }

    /**
     * This routine parses a number from the input range, dispatching to the
     * hexadecimal parser when the input starts with "0x" and to the decimal
     * parser otherwise.
     *
     * Throws: InvalidTokenException if a malformed hexadecimal number is
     * encountered.
     *
     * Returns: The token for the parsed number.
     */
    private Token parseNumber() {
        // A bare "0x" (even as the last two bytes of the stream) is routed to
        // the hex parser so it is rejected there, rather than being split
        // into the number 0 followed by a stray 'x'.
        if (m_charStream.length >= 2 && m_charStream[0..2] == "0x") {
            return parseHexNumber();
        }

        return parseDecimalNumber();
    }

    /**
     * This routine parses a hexadecimal number from the input range.
     *
     * Throws: InvalidTokenException if an invalid token is encountered, i.e.
     * the "0x" prefix is not followed by any digit or the collected text
     * cannot be converted to a number.
     *
     * Returns: The token from the parsed input stream.
     */
    private Token parseHexNumber() {
        // Remember where the token starts so errors point at its beginning.
        immutable size_t startColumn = m_columnNumber;
        ubyte[] hexNumber;

        // 'x' is accepted here so the "0x" prefix is collected along with the
        // digits; a misplaced 'x' is rejected by the conversion below.
        while (!m_charStream.empty &&
               (isHexDigit(m_charStream.front) || m_charStream.front == 'x')) {
            hexNumber ~= m_charStream.front;

            m_columnNumber += 1;
            m_charStream.popFront();
        }

        // Reject a prefix with no digits explicitly instead of relying on how
        // the BigInt constructor treats the string "0x".
        if (hexNumber.length <= 2) {
            throw tokenError("Invalid hexadecimal number",
                             hexNumber,
                             startColumn);
        }

        try {
            return new Number(BigInt(hexNumber.assumeUTF));
        } catch (ConvException) {
            throw tokenError("Invalid hexadecimal number",
                             hexNumber,
                             startColumn);
        }
    }

    /**
     * This routine parses a decimal number from the input range.
     *
     * Returns: The parsed number from the input range.
     */
    private Token parseDecimalNumber() {
        ubyte[] number;

        while (!m_charStream.empty && isDigit(m_charStream.front)) {
            number ~= m_charStream.front;

            m_columnNumber += 1;
            m_charStream.popFront();
        }

        return new Number(BigInt(number.assumeUTF));
    }

    /**
     * This routine parses an opcode from the input range. Names starting
     * with "PUSH" become push opcodes; everything else becomes a stack
     * opcode.
     *
     * Throws: InvalidTokenException if the name is not a valid opcode.
     *
     * Returns: The token for the parsed opcode.
     */
    private Token parseOpcode() {
        if (m_charStream.length >= 4 && m_charStream[0..4] == "PUSH") {
            return parsePushOpcode();
        }

        return parseStackOpcode();
    }

    /**
     * This routine parses a push opcode from the input range.
     *
     * Throws: InvalidTokenException if an invalid token is encountered.
     *
     * Returns: Token for the push opcode.
     */
    private Token parsePushOpcode() {
        return new PushOpcode(scanOpcodeName().assumeUTF);
    }

    /**
     * This routine parses a stack opcode from the input range.
     *
     * Throws: InvalidTokenException if an invalid token is encountered.
     *
     * Returns: Token for the stack opcode.
     */
    private Token parseStackOpcode() {
        return new StackOpcode(scanOpcodeName().assumeUTF);
    }

    /**
     * Consumes a run of ASCII letters and digits from the input range and
     * checks it against the table of valid opcode names.
     *
     * Throws: InvalidTokenException if the collected name is not a key of
     * the valid opcode table.
     *
     * Returns: The bytes of the opcode name that was consumed.
     */
    private ubyte[] scanOpcodeName() {
        // Remember where the token starts so errors point at its beginning.
        immutable size_t startColumn = m_columnNumber;
        ubyte[] name;

        while (!m_charStream.empty && isAlphaNum(m_charStream.front)) {
            name ~= m_charStream.front;

            m_columnNumber += 1;
            m_charStream.popFront();
        }

        if (cast(string)name !in m_validOpcodeNames) {
            throw tokenError("Expected valid opcode, found", name, startColumn);
        }

        return name;
    }

    /**
     * Builds the exception reported for a malformed token.
     *
     * Params:
     *     description = Leading text of the message
     *     text        = The offending bytes, shown in quotes
     *     column      = Column at which the offending token starts
     *
     * Returns: An InvalidTokenException carrying the formatted message with
     * the current line number and the given column.
     */
    private InvalidTokenException tokenError(string description,
                                             ubyte[] text,
                                             size_t column) {
        return new InvalidTokenException(
            format("%s '%s' (Line %d, Column %d)",
                   description,
                   cast(string)text,
                   m_lineNumber,
                   column));
    }
}
295 
///
unittest {
    // An empty stream immediately yields the end-of-stream token.
    {
        auto emptyInput = new Scanner(cast(ubyte[])"".representation);
        assert(cast(EndOfStream)emptyInput.nextToken());
    }

    // A run of whitespace is one token and advances the line/column counters.
    {
        auto blanks = new Scanner(cast(ubyte[])"\n\n\r\n  ".representation);
        assert(cast(Whitespace)blanks.nextToken());
        assert(blanks.lineNumber == 4);
        assert(blanks.columnNumber == 3);
    }

    // A plain opcode name is scanned as a stack opcode.
    {
        auto stopOnly = new Scanner(cast(ubyte[])"STOP".representation);
        StackOpcode stop = cast(StackOpcode)stopOnly.nextToken();
        assert(stop.m_opcode == "STOP");
    }

    // A push opcode followed by a hexadecimal operand.
    {
        auto hexPush = new Scanner(cast(ubyte[])"PUSH1 0xa".representation);
        PushOpcode push = cast(PushOpcode)hexPush.nextToken();
        assert(push.m_opcode == "PUSH1");
        assert(cast(Whitespace)hexPush.nextToken());
        Number operand = cast(Number)hexPush.nextToken();
        assert(operand.m_value == BigInt(10));
    }

    // A push opcode followed by a decimal operand.
    {
        auto decPush = new Scanner(cast(ubyte[])"PUSH1 1234".representation);
        PushOpcode push = cast(PushOpcode)decPush.nextToken();
        assert(push.m_opcode == "PUSH1");
        assert(cast(Whitespace)decPush.nextToken());
        Number operand = cast(Number)decPush.nextToken();
        assert(operand.m_value == BigInt("1234"));
    }
}
324 
unittest {
    // Invalid hex number: the "0x" prefix has no digits after it.
    {
        auto badHex = new Scanner(cast(ubyte[])"PUSH1 0x\nDUP1".representation);
        badHex.nextToken(); // PUSH1
        badHex.nextToken(); // whitespace
        assertThrown!InvalidTokenException(badHex.nextToken());
    }

    // Invalid push opcode
    {
        auto badPush = new Scanner(cast(ubyte[])"PUSH36 0xa".representation);
        assertThrown!InvalidTokenException(badPush.nextToken());
    }

    // Invalid character in stream
    {
        auto badChar = new Scanner(cast(ubyte[])"PUSH1-0xfa".representation);
        badChar.nextToken(); // PUSH1
        assertThrown!InvalidTokenException(badChar.nextToken());
    }
}