comparison fontaine/parser.py @ 1:74b83e3d921e

Add more states, add more tests.
author Ludovic Chabant <ludovic@chabant.com>
date Mon, 02 Jan 2017 21:54:59 -0800
parents 243401c49520
children 59fe8cb6190d
comparison
equal deleted inserted replaced
0:243401c49520 1:74b83e3d921e
5 logger = logging.getLogger(__name__) 5 logger = logging.getLogger(__name__)
6 6
7 7
8 class FontaineState: 8 class FontaineState:
9 can_merge = False 9 can_merge = False
10 needs_pending_empty_lines = True
11
12 def __init__(self):
13 self.has_pending_empty_line = False
10 14
11 def match(self, fp, ctx): 15 def match(self, fp, ctx):
12 return False 16 return False
13 17
14 def consume(self, fp, ctx): 18 def consume(self, fp, ctx):
15 raise NotImplementedError() 19 raise NotImplementedError()
20
21 def merge(self):
22 pass
16 23
17 def exit(self, ctx): 24 def exit(self, ctx):
18 pass 25 pass
26
27
28 class _PassThroughState(FontaineState):
29 def consume(self, fp, ctx):
30 return ANY_STATE
19 31
20 32
21 class FontaineParserError(Exception): 33 class FontaineParserError(Exception):
22 def __init__(self, line_no, message): 34 def __init__(self, line_no, message):
23 super().__init__("Error line %d: %s" % (line_no, message)) 35 super().__init__("Error line %d: %s" % (line_no, message))
37 def __init__(self): 49 def __init__(self):
38 super().__init__() 50 super().__init__()
39 self._cur_key = None 51 self._cur_key = None
40 self._cur_val = None 52 self._cur_val = None
41 53
42 def consume(self, fp, ctx): 54 def match(self, fp, ctx):
43 line = fp.readline() 55 line = fp.peekline()
44 if not line: 56 return RE_TITLE_KEY_VALUE.match(line)
45 return EOF_STATE 57
46 58 def consume(self, fp, ctx):
47 if RE_EMPTY_LINE.match(line): 59 while True:
48 self._commit(ctx) 60 line = fp.readline()
49 # Finished with the page title, now move on to the first scene. 61 if not line:
50 # However, if we never had any page title, go back to the beginning 62 return EOF_STATE
51 # so we don't consume anybody else's empty lines. 63
52 if len(ctx.document.title_values) == 0: 64 m = RE_TITLE_KEY_VALUE.match(line)
53 fp.seek0() 65 if m:
54 return ANY_STATE 66 # Commit current value, start new one.
55 67 self._commit(ctx)
56 m = RE_TITLE_KEY_VALUE.match(line) 68 self._cur_key = m.group('key')
57 if m: 69 self._cur_val = line[m.end():].strip()
58 # Commit current value, start new one. 70 else:
59 self._commit(ctx) 71 # Keep accumulating the value of one of the title page's
60 self._cur_key = m.group('key') 72 # values.
61 self._cur_val = line[m.end():].strip() 73 self._cur_val += line.strip()
62 else: 74
63 if self._cur_val is None: 75 if RE_EMPTY_LINE.match(fp.peekline()):
64 if len(ctx.document.title_values) == 0: 76 self._commit(ctx)
65 # Early exit because there's no title page. 77 # Finished with the page title, now move on to the first scene.
66 # Go back to the beginning so we don't consume somebody's 78 self.has_pending_empty_line = True
67 # first line of text. 79 break
68 fp.seek0() 80
69 return ANY_STATE 81 return ANY_STATE
70
71 raise FontaineParserError(
72 fp.line_no,
73 "Page title needs to be followed by 2 empty lines.")
74
75 # Keep accumulating the value of one of the title page's values.
76 self._cur_val += line.strip()
77 return True
78 82
79 def exit(self, ctx): 83 def exit(self, ctx):
80 self._commit(ctx) 84 self._commit(ctx)
81 85
82 def _commit(self, ctx): 86 def _commit(self, ctx):
90 r"^(int|ext|est|int/ext|int./ext|i/e)[\s\.]", re.I) 94 r"^(int|ext|est|int/ext|int./ext|i/e)[\s\.]", re.I)
91 95
92 96
93 class _SceneHeaderState(FontaineState): 97 class _SceneHeaderState(FontaineState):
94 def match(self, fp, ctx): 98 def match(self, fp, ctx):
95 lines = fp.peeklines(2) 99 lines = fp.peeklines(3)
96 return ( 100 return (
97 RE_EMPTY_LINE.match(lines[0]) and 101 RE_EMPTY_LINE.match(lines[0]) and
98 RE_SCENE_HEADER_PATTERN.match(lines[1])) 102 RE_SCENE_HEADER_PATTERN.match(lines[1]) and
103 RE_EMPTY_LINE.match(lines[2]))
99 104
100 def consume(self, fp, ctx): 105 def consume(self, fp, ctx):
101 fp.readline() # Get past the blank line. 106 fp.readline() # Get past the blank line.
102 line = fp.readline().rstrip('\r\n') 107 line = fp.readline().rstrip('\r\n')
103 line = line.lstrip('.') # In case it was forced. 108 line = line.lstrip('.') # In case it was forced.
104 ctx.document.addScene(line) 109 ctx.document.addScene(line)
110 self.has_pending_empty_line = True
105 return ANY_STATE 111 return ANY_STATE
106 112
107 113
108 class _ActionState(FontaineState): 114 class _ActionState(FontaineState):
109 can_merge = True 115 can_merge = True
116 needs_pending_empty_lines = False
110 117
111 def __init__(self): 118 def __init__(self):
112 super().__init__() 119 super().__init__()
113 self.text = '' 120 self.text = ''
121 self._to_merge = None
122 self._was_merged = False
114 123
115 def match(self, fp, ctx): 124 def match(self, fp, ctx):
116 return True 125 return True
117 126
118 def consume(self, fp, ctx): 127 def consume(self, fp, ctx):
121 line = fp.readline() 130 line = fp.readline()
122 if not line: 131 if not line:
123 return EOF_STATE 132 return EOF_STATE
124 133
125 if is_first_line: 134 if is_first_line:
126 line = line.lstrip('!') 135 line = line.lstrip('!') # In case it was forced.
127 is_first_line = False 136 is_first_line = False
128 137
138 # If the next line is empty, strip the line ending from
139 # the line we just got because it's probably going to be the
140 # last one.
141 if RE_EMPTY_LINE.match(fp.peekline()):
142 stripped_line = line.rstrip("\r\n")
143 self.text += stripped_line
144 self._to_merge = line[len(stripped_line):]
145 break
146 # ...otherwise, add the line in full.
129 self.text += line 147 self.text += line
130 148
131 if RE_EMPTY_LINE.match(fp.peekline()): 149 return ANY_STATE
132 break 150
133 151 def merge(self):
134 return ANY_STATE 152 # Put back the stuff we stripped from what we thought was the
153 # last line.
154 self.text += self._to_merge
155 self._was_merged = True
135 156
136 def exit(self, ctx): 157 def exit(self, ctx):
137 ctx.document.lastScene().addAction(self.text) 158 ctx.document.lastScene().addAction(self.text)
138 159
139 160
140 RE_CHARACTER_LINE = re.compile(r"^[A-Z\-]+\s*(\(.*\))?$", re.M) 161 RE_CENTERED_LINE = re.compile(r"^\s*>\s*.*\s*<\s*$", re.M)
162
163
164 class _CenteredActionState(FontaineState):
165 def __init__(self):
166 super().__init__()
167 self.text = ''
168 self._aborted = False
169
170 def match(self, fp, ctx):
171 lines = fp.peeklines(2)
172 return (
173 RE_EMPTY_LINE.match(lines[0]) and
174 RE_CENTERED_LINE.match(lines[1]))
175
176 def consume(self, fp, ctx):
177 snapshot = fp.snapshot()
178 fp.readline() # Get past the empty line.
179 while True:
180 line = fp.readline()
181 if not line:
182 return EOF_STATE
183
184 clean_line = line.rstrip('\r\n')
185 eol = line[len(clean_line):]
186
187 clean_line = clean_line.strip()
188 if clean_line[0] != '>' or clean_line[-1] != '<':
189 # The whole paragraph must have `>` and `<` wrappers, so
190 # if we detect a line that doesn't have them, we make this
191 # paragraph be a normal action instead.
192 fp.restore(snapshot)
193 self.has_pending_empty_line = True
194 self._aborted = True
195 return _ActionState()
196 else:
197 # Remove wrapping `>`/`<`, and spaces.
198 clean_line = clean_line[1:-1].strip()
199
200 if RE_EMPTY_LINE.match(fp.peekline()):
201 self.text += clean_line
202 self.has_pending_empty_line = True
203 break
204 self.text += clean_line + eol
205
206 return ANY_STATE
207
208 def exit(self, ctx):
209 if not self._aborted:
210 ctx.document.lastScene().addCenteredAction(self.text)
211
212
213 RE_CHARACTER_LINE = re.compile(r"^\s*[A-Z\-]+\s*(\(.*\))?$", re.M)
141 214
142 215
143 class _CharacterState(FontaineState): 216 class _CharacterState(FontaineState):
144 def match(self, fp, ctx): 217 def match(self, fp, ctx):
145 lines = fp.peeklines(3) 218 lines = fp.peeklines(3)
148 not RE_EMPTY_LINE.match(lines[2])) 221 not RE_EMPTY_LINE.match(lines[2]))
149 222
150 def consume(self, fp, ctx): 223 def consume(self, fp, ctx):
151 fp.readline() # Get past the empty line. 224 fp.readline() # Get past the empty line.
152 line = fp.readline().rstrip('\r\n') 225 line = fp.readline().rstrip('\r\n')
226 line = line.lstrip() # Remove indenting.
153 line = line.lstrip('@') # In case it was forced. 227 line = line.lstrip('@') # In case it was forced.
154 ctx.document.lastScene().addCharacter(line) 228 ctx.document.lastScene().addCharacter(line)
155 return [_ParentheticalState, _DialogState] 229 return [_ParentheticalState, _DialogState]
156 230
157 231
164 # one is already that. 238 # one is already that.
165 line = fp.peekline() 239 line = fp.peekline()
166 return RE_PARENTHETICAL_LINE.match(line) 240 return RE_PARENTHETICAL_LINE.match(line)
167 241
168 def consume(self, fp, ctx): 242 def consume(self, fp, ctx):
169 line = fp.readline().rstrip('\r\n') 243 line = fp.readline().lstrip().rstrip('\r\n')
170 ctx.document.lastScene().addParenthetical(line) 244 ctx.document.lastScene().addParenthetical(line)
171 return [_DialogState, _CharacterState, _ActionState] 245
246 next_line = fp.peekline()
247 if not RE_EMPTY_LINE.match(next_line):
248 return _DialogState()
249
250 self.has_pending_empty_line = True
251 return ANY_STATE
172 252
173 253
174 class _DialogState(FontaineState): 254 class _DialogState(FontaineState):
175 def __init__(self): 255 def __init__(self):
176 super().__init__() 256 super().__init__()
177 self.text = '' 257 self.text = ''
178 258
179 def match(self, fp, ctx): 259 def match(self, fp, ctx):
260 # We only get here from a `_CharacterState` or `_ParentheticalState`
261 # so we just need to check there's some text.
180 line = fp.peekline() 262 line = fp.peekline()
181 return not RE_EMPTY_LINE.match(line) 263 return not RE_EMPTY_LINE.match(line)
182 264
183 def consume(self, fp, ctx): 265 def consume(self, fp, ctx):
184 while True: 266 while True:
185 line = fp.readline() 267 line = fp.readline()
186 if not line: 268 if not line:
187 return EOF_STATE 269 return EOF_STATE
270
271 line = line.lstrip() # Remove indenting.
272
273 # Next we could be either continuing the dialog line, going to
274 # a parenthetical, or exiting dialog altogether.
275 next_line = fp.peekline()
276
277 if RE_PARENTHETICAL_LINE.match(next_line):
278 self.text += line.rstrip('\r\n')
279 return _ParentheticalState()
280
281 if RE_EMPTY_LINE.match(next_line):
282 self.text += line.rstrip('\r\n')
283 self.has_pending_empty_line = True
284 break
188 self.text += line 285 self.text += line
189 if RE_EMPTY_LINE.match(fp.peekline()): 286
190 break
191 return ANY_STATE 287 return ANY_STATE
192 288
193 def exit(self, ctx): 289 def exit(self, ctx):
194 ctx.document.lastScene().addDialog(self.text.rstrip('\r\n')) 290 ctx.document.lastScene().addDialog(self.text.rstrip('\r\n'))
195 291
196 292
197 class _LyricsState(FontaineState): 293 class _LyricsState(FontaineState):
198 pass 294 def __init__(self):
295 super().__init__()
296 self.text = ''
297 self._aborted = False
298
299 # No `match` method, this can only be forced.
300 # (see `_ForcedParagraphStates`)
301
302 def consume(self, fp, ctx):
303 snapshot = fp.snapshot()
304 fp.readline() # Get past the empty line.
305 while True:
306 line = fp.readline()
307 if not line:
308 return EOF_STATE
309
310 if line.startswith('~'):
311 line = line.lstrip('~')
312 else:
313 logger.debug("Rolling back lyrics into action paragraph.")
314 fp.restore(snapshot)
315 self.has_pending_empty_line = True
316 self._aborted = True
317 return _ActionState()
318
319 if RE_EMPTY_LINE.match(fp.peekline()):
320 self.text += line.rstrip('\r\n')
321 self.has_pending_empty_line = True
322 break
323 self.text += line
324
325 return ANY_STATE
326
327 def exit(self, ctx):
328 if not self._aborted:
329 ctx.document.lastScene().addLyrics(self.text)
330
331
332 RE_TRANSITION_LINE = re.compile(r"^\s*[^a-z]+TO\:$", re.M)
199 333
200 334
201 class _TransitionState(FontaineState): 335 class _TransitionState(FontaineState):
202 pass 336 def match(self, fp, ctx):
337 lines = fp.peeklines(3)
338 return (
339 RE_EMPTY_LINE.match(lines[0]) and
340 RE_TRANSITION_LINE.match(lines[1]) and
341 RE_EMPTY_LINE.match(lines[2]))
342
343 def consume(self, fp, ctx):
344 fp.readline() # Get past the empty line.
345 line = fp.readline().lstrip().rstrip('\r\n')
346 line = line.lstrip('>') # In case it was forced.
347 ctx.document.lastScene().addTransition(line)
348 self.has_pending_empty_line = True
349
350
351 RE_PAGE_BREAK_LINE = re.compile(r"^\=\=\=+$", re.M)
352
353
354 class _PageBreakState(FontaineState):
355 def match(self, fp, ctx):
356 lines = fp.peeklines(3)
357 return (
358 RE_EMPTY_LINE.match(lines[0]) and
359 RE_PAGE_BREAK_LINE.match(lines[1]) and
360 RE_EMPTY_LINE.match(lines[2]))
361
362 def consume(self, fp, ctx):
363 fp.readline()
364 fp.readline()
365 ctx.document.lastScene().addPageBreak()
366 self.has_pending_empty_line = True
367 return ANY_STATE
203 368
204 369
205 class _ForcedParagraphStates(FontaineState): 370 class _ForcedParagraphStates(FontaineState):
206 STATE_SYMBOLS = { 371 STATE_SYMBOLS = {
207 '.': _SceneHeaderState, 372 '.': _SceneHeaderState,
212 } 377 }
213 378
214 def __init__(self): 379 def __init__(self):
215 super().__init__() 380 super().__init__()
216 self._state_cls = None 381 self._state_cls = None
382 self._consume_empty_line = False
217 383
218 def match(self, fp, ctx): 384 def match(self, fp, ctx):
219 lines = fp.peeklines(2) 385 lines = fp.peeklines(2)
386 symbol = lines[1][:1]
220 if (RE_EMPTY_LINE.match(lines[0]) and 387 if (RE_EMPTY_LINE.match(lines[0]) and
221 lines[1][:1] in self.STATE_SYMBOLS): 388 symbol in self.STATE_SYMBOLS):
222 self._state_cls = self.STATE_SYMBOLS[lines[1][:1]] 389 # Special case: don't force a transition state if it's
390 # really some centered text.
391 if symbol == '>' and RE_CENTERED_LINE.match(lines[1]):
392 return False
393
394 self._state_cls = self.STATE_SYMBOLS[symbol]
395
396 # Special case: for forced action paragraphs, don't leave
397 # the blank line there.
398 if symbol == '!':
399 self._consume_empty_line = True
400
223 return True 401 return True
224 return False 402 return False
225 403
226 def consume(self, fp, ctx): 404 def consume(self, fp, ctx):
405 if self._consume_empty_line:
406 fp.readline()
227 return self._state_cls() 407 return self._state_cls()
228 408
229 409
230 STATES = [ 410 ROOT_STATES = [
231 _ForcedParagraphStates, # Must be first. 411 _ForcedParagraphStates, # Must be first.
232 _SceneHeaderState, 412 _SceneHeaderState,
233 _CharacterState, 413 _CharacterState,
234 _TransitionState, 414 _TransitionState,
415 _PageBreakState,
416 _CenteredActionState,
235 _ActionState, # Must be last. 417 _ActionState, # Must be last.
236 ] 418 ]
237 419
238 420
239 class _PeekableFile: 421 class _PeekableFile:
240 def __init__(self, fp): 422 def __init__(self, fp):
241 self.line_no = 1 423 self.line_no = 1
242 self._fp = fp 424 self._fp = fp
243 425 self._blankAt0 = False
244 def read(self, size=-1):
245 return self._doRead(size, True)
246
247 def read1(self):
248 return self.read(1)
249
250 def peek1(self):
251 pos = self._fp.tell()
252 c = self._doRead(1, False)
253 self._fp.seek(pos)
254 return c
255 426
256 def readline(self, size=-1): 427 def readline(self, size=-1):
428 if self._blankAt0:
429 self._blankAt0 = False
430 return '\n'
431
257 data = self._fp.readline(size) 432 data = self._fp.readline(size)
258 self.line_no += 1 433 self.line_no += 1
259 return data 434 return data
260 435
261 def peekline(self): 436 def peekline(self):
437 if self._blankAt0:
438 return '\n'
439
262 pos = self._fp.tell() 440 pos = self._fp.tell()
263 line = self._fp.readline() 441 line = self._fp.readline()
264 self._fp.seek(pos) 442 self._fp.seek(pos)
265 return line 443 return line
266 444
267 def peeklines(self, count): 445 def peeklines(self, count):
268 pos = self._fp.tell() 446 pos = self._fp.tell()
269 lines = [] 447 lines = []
448 if self._blankAt0:
449 lines.append('\n')
450 count -= 1
270 for i in range(count): 451 for i in range(count):
271 lines.append(self._fp.readline()) 452 lines.append(self._fp.readline())
272 self._fp.seek(pos) 453 self._fp.seek(pos)
273 return lines 454 return lines
274 455
275 def seek0(self): 456 def snapshot(self):
276 self._fp.seek(0) 457 return (self._fp.tell(), self._blankAt0, self.line_no)
277 self.line_no = 1 458
278 459 def restore(self, snapshot):
279 def _doRead(self, size, advance_line_no): 460 self._fp.seek(snapshot[0])
461 self._blankAt0 = snapshot[1]
462 self.line_no = snapshot[2]
463
464 def _addBlankAt0(self):
465 if self._fp.tell() != 0:
466 raise Exception(
467 "Can't add blank line at 0 if reading has started.")
468 self._blankAt0 = True
469
470 def _read(self, size, advance_line_no):
280 data = self._fp.read(size) 471 data = self._fp.read(size)
281 if advance_line_no: 472 if advance_line_no:
282 self.line_no += data.count('\n') 473 self.line_no += data.count('\n')
283 return data 474 return data
284 475
292 @property 483 @property
293 def line_no(self): 484 def line_no(self):
294 return self.fp.line_no 485 return self.fp.line_no
295 486
296 def run(self): 487 def run(self):
488 # Start with the page title... unless it doesn't match, in which
489 # case we start with a "pass through" state that will just return
490 # `ANY_STATE` so we can start matching stuff.
297 self.state = _TitlePageState() 491 self.state = _TitlePageState()
492 if not self.state.match(self.fp, self):
493 logger.debug("No title page value found on line 1, "
494 "using pass-through state with added blank line.")
495 self.state = _PassThroughState()
496 if not RE_EMPTY_LINE.match(self.fp.peekline()):
497 # Add a fake empty line at the beginning of the text if
498 # there's not one already. This makes state matching easier.
499 self.fp._addBlankAt0()
500 # Make this added empty line "pending" so if the first line
501 # is an action paragraph, it doesn't include it.
502 self.state.has_pending_empty_line = True
503
504 # Start parsing! Here we try to do a mostly-forward-only parser with
505 # non-overlapping regexes to make it decently fast.
298 while True: 506 while True:
299 logger.debug("State '%s' consuming from '%s'..." % 507 logger.debug("State '%s' consuming from '%s'..." %
300 (self.state.__class__.__name__, self.fp.peekline())) 508 (self.state.__class__.__name__, self.fp.peekline()))
301 res = self.state.consume(self.fp, self) 509 res = self.state.consume(self.fp, self)
302 510
311 raise Exception( 519 raise Exception(
312 "States need to return `ANY_STATE`, one or more specific " 520 "States need to return `ANY_STATE`, one or more specific "
313 "states, or `EOF_STATE` if they reached the end of the " 521 "states, or `EOF_STATE` if they reached the end of the "
314 "file.") 522 "file.")
315 523
316 if res is True: 524 elif res is ANY_STATE or isinstance(res, list):
317 # State continues to consume.
318 continue
319
320 if res is ANY_STATE or isinstance(res, list):
321 # State wants to exit, we need to figure out what is the 525 # State wants to exit, we need to figure out what is the
322 # next state. 526 # next state.
323 pos = self.fp._fp.tell() 527 pos = self.fp._fp.tell()
324 next_states = res 528 next_states = res
325 if next_states is ANY_STATE: 529 if next_states is ANY_STATE:
326 next_states = STATES 530 next_states = ROOT_STATES
327 logger.debug("Trying to match next state from: %s" % 531 logger.debug("Trying to match next state from: %s" %
328 [t.__name__ for t in next_states]) 532 [t.__name__ for t in next_states])
329 for sc in next_states: 533 for sc in next_states:
330 s = sc() 534 s = sc()
331 if s.match(self.fp, self): 535 if s.match(self.fp, self):
335 res = s 539 res = s
336 break 540 break
337 else: 541 else:
338 raise Exception("Can't match following state after: %s" % 542 raise Exception("Can't match following state after: %s" %
339 self.state) 543 self.state)
544
545 # Handle the current state before we move on to the new one.
340 if self.state: 546 if self.state:
341 if type(self.state) == type(res) and self.state.can_merge: 547 if type(self.state) == type(res) and self.state.can_merge:
342 # Don't switch states if the next state is the same 548 # Don't switch states if the next state is the same
343 # type and that type supports merging. 549 # type and that type supports merging.
550 self.state.merge()
344 continue 551 continue
345 552
346 self.state.exit(self) 553 self.state.exit(self)
554 if (self.state.has_pending_empty_line and
555 not res.needs_pending_empty_lines):
556 logger.debug("Skipping pending blank line from %s" %
557 self.state.__class__.__name__)
558 self.fp.readline()
347 559
348 self.state = res 560 self.state = res
349 continue 561
350 562 elif isinstance(res, FontaineState):
351 if isinstance(res, FontaineState):
352 # State wants to exit, wants a specific state to be next. 563 # State wants to exit, wants a specific state to be next.
353 if self.state: 564 if self.state:
354 self.state.exit(self) 565 self.state.exit(self)
566 if (self.state.has_pending_empty_line and
567 not res.needs_pending_empty_lines):
568 logger.debug("Skipping pending blank line from %s" %
569 self.state.__class__.__name__)
570 self.fp.readline()
355 self.state = res 571 self.state = res
356 continue 572
357 573 elif res is EOF_STATE:
358 if res is EOF_STATE:
359 # Reached end of file. 574 # Reached end of file.
360 if self.state: 575 if self.state:
361 self.state.exit(self) 576 self.state.exit(self)
362 break 577 break
363 578
364 raise Exception("Unsupported state result: %s" % res) 579 else:
580 raise Exception("Unsupported state result: %s" % res)
365 581
366 582
367 class FontaineParser: 583 class FontaineParser:
368 def __init__(self): 584 def __init__(self):
369 pass 585 pass