lib/src/line_splitter.dart - Issue 1257903002: Optimize splitting long, complex lines.

Side by Side Diff: lib/src/line_splitter.dart

Issue 1257903002: Optimize splitting long, complex lines. (Closed) Base URL: https://github.com/dart-lang/dart_style.git@master

Patch Set: Created 5 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 // Copyright (c) 2015, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.

4

5 library dart_style.src.line_splitter;

6

7 import 'package:collection/priority_queue.dart';

8

9 import 'chunk.dart';

10 import 'debug.dart' as debug;

11 import 'line_writer.dart';

12 import 'rule/rule.dart';

13 import 'rule_set.dart';

14

15 /// To ensure the solver doesn't go totally pathological on giant code, we cap

16 /// it at a fixed number of attempts.

17 ///

18 /// If the optimal solution isn't found after this many tries, it just uses the

19 /// best it found so far.

20 const _maxAttempts = 50000;

21

22 /// Takes a set of chunks and determines the best values for its rules in order

23 /// to fit it inside the page boundary.

24 ///

25 /// This problem is exponential in the number of rules and a single expression

26 /// in Dart can be quite large, so it isn't feasible to brute force this. For

27 /// example:

28 ///

29 /// outer(

30 /// fn(1 + 2, 3 + 4, 5 + 6, 7 + 8),

31 /// fn(1 + 2, 3 + 4, 5 + 6, 7 + 8),

32 /// fn(1 + 2, 3 + 4, 5 + 6, 7 + 8),

33 /// fn(1 + 2, 3 + 4, 5 + 6, 7 + 8));

34 ///

35 /// There are 509,607,936 ways this can be split.

36 ///

37 /// The problem is even harder because we may not be able to easily tell if a

38 /// given solution is the best one. It's possible that there is no solution

39 /// that fits in the page (due to long strings or identifiers) so the winning

40 /// solution may still have overflow characters. This makes it hard to know

41 /// when we are done and can stop looking.

42 ///

43 /// There are a couple of pieces of domain knowledge we use to cope with this:

44 ///

45 /// - Changing a rule from unsplit to split will never lower its cost. A

46 /// solution with all rules unsplit will always be the one with the lowest

47 /// cost (zero). Conversely, setting all of its rules to the maximum split

48 /// value will always have the highest cost.

49 ///

50 /// (You might think there is a converse rule about overflow characters. The

51 /// solution with the fewest splits will have the most overflow, and the

52 /// solution with the most splits will have the least overflow. Alas, because

53 /// of indentation, that isn't always the case. Adding a split may increase

54 /// overflow in some cases.)

55 ///

56 /// - If all of the chunks for a rule are inside lines that already fit in the

57 /// page, then splitting that rule will never improve the solution.

58 ///

59 /// We start off with a [SolveState] where all rules are unbound (which

60 /// implicitly treats them as unsplit). For a given solve state, we can produce

61 /// a set of expanded states that takes some of the rules in the first long

62 /// line and bind them to split values. This always produces new solve states

63 /// with higher cost (but often fewer overflow characters) than the parent

64 /// state.

65 ///

66 /// We take these expanded states and add them to a work list sorted by cost.

67 /// Since unsplit rules always have lower cost solutions, we know that no state

68 /// we enqueue later will ever have a lower cost than the ones we already have

69 /// enqueued.

70 ///

71 /// Then we keep pulling states off the work list and expanding them and adding

72 /// the results back into the list. We do this until we hit a solution where

73 /// all characters fit in the page. The first one we find will have the lowest

74 /// cost and we're done.

75 ///

76 /// We also keep running track of the best solution we've found so far that

77 /// has the fewest overflow characters and the lowest cost. If no solution fits,

78 /// we'll use this one.

79 ///

80 /// As a final escape hatch for pathologically nasty code, after trying some

81 /// fixed maximum number of solve states, we just bail and return the best

82 /// solution found so far.

83 ///

84 /// Even with the above algorithmic optimizations, complex code may still

85 /// require a lot of exploring to find an optimal solution. To make that fast,

86 /// this code is carefully profiled and optimized. If you modify this, make

87 /// sure to test against the benchmark to ensure you don't regress performance.

88 class LineSplitter {

89 final LineWriter _writer;

90

91 /// The list of chunks being split.

92 final List<Chunk> _chunks;

93

94 /// The set of soft rules whose values are being selected.

95 final List<Rule> _rules;

96

97 /// The number of characters of additional indentation to apply to each line.

98 ///

99 /// This is used when formatting blocks to get the output into the right

100 /// column based on where the block appears.

101 final int _blockIndentation;

102

103 /// The starting column of the first line.

104 final int _firstLineIndent;

105

106 /// The list of solve states to explore further.

107 ///

108 /// This is sorted lowest-cost first. This ensures that as soon as we find a

109 /// solution that fits in the page, we know it will be the lowest cost one

110 /// and can stop looking.

111 final _workList = new HeapPriorityQueue<SolveState>();

112

113 /// The lowest cost solution found so far.

114 SolveState _bestSolution;

115

116 /// Creates a new splitter for [_writer] that tries to fit [chunks] into the

117 /// page width.

118 LineSplitter(this._writer, List<Chunk> chunks, int blockIndentation,

119 int firstLineIndent,

120 {bool flushLeft: false})

121 : _chunks = chunks,

122 // Collect the set of soft rules that we need to select values for.

123 _rules = chunks

124 .map((chunk) => chunk.rule)

125 .where((rule) => rule != null && rule is! HardSplitRule)

126 .toSet()

127 .toList(growable: false),

128 _blockIndentation = blockIndentation,

129 _firstLineIndent = flushLeft ? 0 : firstLineIndent + blockIndentation {

130 // Store the rule's index in the rule so we can get from a chunk to a rule

131 // index quickly.

132 for (var i = 0; i < _rules.length; i++) {

133 _rules[i].index = i;

134 }

135 }

136

137 /// Determine the best way to split the chunks into lines that fit in the

138 /// page, if possible.

139 ///

140 /// Returns a [SplitSet] that defines where each split occurs and the

141 /// indentation of each line.

142 ///

143 /// [firstLineIndent] is the number of characters of whitespace to prefix the

144 /// first line of output with.

145 SplitSet apply() {

146 // Start with a completely unbound, unsplit solution.

147 _workList.add(new SolveState(this, new RuleSet(_rules.length)));

148

149 var attempts = 0;

150 while (!_workList.isEmpty) {

151 var state = _workList.removeFirst();

152

153 if (state.isBetterThan(_bestSolution)) {

154 _bestSolution = state;

155

156 // Since we sort solutions by cost the first solution we find that

157 // fits is the winner.

158 if (_bestSolution.overflowChars == 0) break;

159 }

160

161 if (debug.traceSplitter) {

162 var best = state == _bestSolution ? " (best)" : "";

163 debug.log("$state$best");

164 debug.dumpLines(_chunks, _firstLineIndent, state.splits);

165 debug.log();

166 }

167

168 if (attempts++ > _maxAttempts) break;

169

170 // Try bumping the rule values for rules whose chunks are on long lines.

171 state.expand();

172 }

173

174 if (debug.traceSplitter) {

175 debug.log("$_bestSolution (winner)");

176 debug.dumpLines(_chunks, _firstLineIndent, _bestSolution.splits);

177 debug.log();

178 }

179

180 return _bestSolution.splits;

181 }

182 }

183

184 /// A possibly incomplete solution in the line splitting search space.

185 ///

186 /// A single [SolveState] binds some subset of the rules to values while

187 /// leaving the rest unbound. If every rule is bound, the solve state describes

188 /// a complete solution to the line splitting problem. Even if rules are

189 /// unbound, a state can also usually be used as a solution by treating all

190 /// unbound rules as unsplit. (The usually is because a state that constrains

191 /// an unbound rule to split can't be used with that rule unsplit.)

192 ///

193 /// From a given solve state, we can explore the search tree to more refined

194 /// solve states by producing new ones that add more bound rules to the current

195 /// state.

196 class SolveState implements Comparable<SolveState> {

197 final LineSplitter _splitter;

198 final RuleSet _ruleValues;

199

200 /// The unbound rules in this state that can be bound to produce new more

201 /// refined states.

202 ///

203 /// Keeping this set small is the key to make the entire line splitter

204 /// perform well. If we consider too make rules at each state, our

205 /// exploration of the solution space is too branchy and we waste time on

206 /// dead end solutions.

207 ///

208 /// Here is the key insight. The line splitter treats any unbound rule as

209 /// being unsplit. This means refining a solution always means taking a rule

210 /// that is unsplit and making it split. That monotonically increases the

211 /// cost, but may help fit the solution inside the page.

212 ///

213 /// We want to keep the cost low, so the only reason to consider making a

214 /// rule split is if it reduces an overflowing line. It's also the case that

215 /// splitting an earlier rule will often reshuffle the rest of the line.

216 ///

217 /// Taking that into account, the only rules we consider binding to extend a

218 /// solve state are unbound rules inside the first line that is overflowing.

219 /// Even if a line has dozens of rules, this generally keeps the branching

220 /// down to a few. It also means rules inside lines that already fit are

221 /// never touched.

222 ///

223 /// There is one other set of rules that go in here. Sometimes a bound rule

224 /// in the solve state constrains some other unbound rule to split. In that

225 /// case, we also consider that active so we know to not leave it at zero.

226 final _liveRules = new Set<Rule>();

227

228 /// The set of splits chosen for this state.

229 SplitSet get splits => _splits;

230 SplitSet _splits;

231

232 /// The number of characters that do not fit inside the page with this set of

233 /// splits.

234 int get overflowChars => _overflowChars;

235 int _overflowChars;

236

237 /// Whether we can treat this state as a complete solution by leaving its

238 /// unbound rules unsplit.

239 ///

240 /// This is generally true but will be false if the state contains any

241 /// unbound rules that are constrained to not be zero by other bound rules.

242 /// This avoids picking a solution that leaves those rules at zero when they

243 /// aren't allowed to be.

244 bool _isComplete = true;

245

246 SolveState(this._splitter, this._ruleValues) {

247 _calculateSplits();

248 _calculateCost();

249 }

250

251 /// Orders this state relative to [other].

252 ///

253 /// This is the best-first ordering that the [LineSplitter] uses in its

254 /// worklist. It prefers cheaper states even if they overflow because this

255 /// ensures it finds the best solution first as soon as it finds one that

256 /// fits in the page so it can early out.

257 int compareTo(SolveState other) {

258 // TODO(rnystrom): It may be worth sorting by the estimated lowest number

259 // of overflow characters first. That doesn't help in cases where there is

260 // a solution that fits, but may help in corner cases where there is no

261 // fitting solution.

262

263 if (splits.cost != other.splits.cost) {

264 return splits.cost.compareTo(other.splits.cost);

265 }

266

267 if (overflowChars != other.overflowChars) {

268 return overflowChars.compareTo(other.overflowChars);

269 }

270

271 // Distinguish states based on the rule values just so that states with the

272 // same cost range but different rule values don't get considered identical

273 // by HeapPriorityQueue.

274 for (var rule in _splitter._rules) {

275 var value = _ruleValues.getValue(rule);

276 var otherValue = other._ruleValues.getValue(rule);

277

278 if (value != otherValue) return value.compareTo(otherValue);

279 }

280

281 // If we get here, this state is identical to [other].

282 return 0;

283 }

284

285 /// Returns `true` if this state is a better solution to use as the final

286 /// result than [other].

287 bool isBetterThan(SolveState other) {

288 // If this state contains an unbound rule that we know can't be left

289 // unsplit, we can't pick this as a solution.

290 if (!_isComplete) return false;

291

292 // Anything is better than nothing.

293 if (other == null) return true;

294

295 // Prefer the solution that fits the most in the page.

296 if (overflowChars != other.overflowChars) {

297 return overflowChars < other.overflowChars;

298 }

299

300 // Otherwise, prefer the best cost.

301 return splits.cost < other.splits.cost;

302 }

303

304 /// Enqueues more solve states to consider based on this one.

305 ///

306 /// For each unbound rule in this state that occurred in the first long line,

307 /// enqueue solve states that bind that rule to each value it can have and

308 /// bind all previous rules to zero. (In other words, try all subsolutions

309 /// where that rule becomes the first new rule to split at.)

310 void expand() {

311 var unsplitRules = _ruleValues.clone();

312

313 // Walk down the rules looking for unbound ones to try.

314 var triedRules = 0;

315 for (var rule in _splitter._rules) {

316 if (_liveRules.contains(rule)) {

317 // We found one worth trying, so try all of its values.

318 for (var value = 1; value < rule.numValues; value++) {

319 var boundRules = unsplitRules.clone();

320

321 var mustSplitRules;

322 var valid = boundRules.tryBind(_splitter._rules, rule, value, (rule) {

323 if (mustSplitRules == null) mustSplitRules = [];

324 mustSplitRules.add(rule);

325 });

326

327 // Make sure we don't violate the constraints of the bound rules.

328 if (!valid) continue;

329

330 var state = new SolveState(_splitter, boundRules);

331

332 // If some unbound rules are constrained to split, remember that.

333 if (mustSplitRules != null) {

334 state._isComplete = false;

335 state._liveRules.addAll(mustSplitRules);

336 }

337

338 _splitter._workList.add(state);

339 }

340

341 // Stop once we've tried all of the ones we can.

342 if (++triedRules == _liveRules.length) break;

343 }

344

345 // Fill in previous unbound rules with zero.

346 if (!_ruleValues.contains(rule)) {

347 // Pass a dummy callback because zero will never fail. (If it would

348 // have, that rule would already be bound to some other value.)

349 if (!unsplitRules.tryBind(_splitter._rules, rule, 0, (_) {})) {

350 break;

351 }

352 }

353 }

354 }

355

356 /// Calculates the [SplitSet] for this solve state, assuming any unbound

357 /// rules are set to zero.

358 void _calculateSplits() {

359 // Figure out which expression nesting levels got split and need to be

360 // assigned columns.

361 var usedNestingLevels = new Set();

362 for (var i = 0; i < _splitter._chunks.length - 1; i++) {

363 var chunk = _splitter._chunks[i];

364 if (chunk.rule.isSplit(_getValue(chunk.rule), chunk)) {

365 usedNestingLevels.add(chunk.nesting);

366 chunk.nesting.clearTotalUsedIndent();

367 }

368 }

369

370 for (var nesting in usedNestingLevels) {

371 nesting.refreshTotalUsedIndent(usedNestingLevels);

372 }

373

374 _splits = new SplitSet(_splitter._chunks.length);

375 for (var i = 0; i < _splitter._chunks.length - 1; i++) {

376 var chunk = _splitter._chunks[i];

377 if (chunk.rule.isSplit(_getValue(chunk.rule), chunk)) {

378 var indent = 0;

379 if (!chunk.flushLeftAfter) {

380 // Add in the chunk's indent.

381 indent = _splitter._blockIndentation + chunk.indent;

382

383 // And any expression nesting.

384 indent += chunk.nesting.totalUsedIndent;

385 }

386

387 _splits.add(i, indent);

388 }

389 }

390 }

391

392 /// Gets the value to use for [rule], either the bound value or `0` if it

393 /// isn't bound.

394 int _getValue(Rule rule) {

395 if (rule is HardSplitRule) return 0;

396

397 return _ruleValues.getValue(rule);

398 }

399

400 /// Evaluates the cost (i.e. the relative "badness") of splitting the line

401 /// into [lines] physical lines based on the current set of rules.

402 void _calculateCost() {

403 assert(_splits != null);

404

405 // Calculate the length of each line and apply the cost of any spans that

406 // get split.

407 var cost = 0;

408 _overflowChars = 0;

409

410 var length = _splitter._firstLineIndent;

411

412 // The unbound rules in use by the current line. This will be null after

413 // the first long line has completed.

414 var currentLineRules = [];

415

416 endLine(int end) {

417 // Track lines that went over the length. It is only rules contained in

418 // long lines that we may want to split.

419 if (length > _splitter._writer.pageWidth) {

420 _overflowChars += length - _splitter._writer.pageWidth;

421

422 // Only try rules that are in the first long line, since we know at

423 // least one of them will be split.

424 if (currentLineRules != null && currentLineRules.isNotEmpty) {

425 _liveRules.addAll(currentLineRules);

426 currentLineRules = null;

427 }

428 } else {

429 // The line fit, so don't keep track of its rules.

430 if (currentLineRules != null) {

431 currentLineRules.clear();

432 }

433 }

434 }

435

436 // The set of spans that contain chunks that ended up splitting. We store

437 // these in a set so a span's cost doesn't get double-counted if more than

438 // one split occurs in it.

439 var splitSpans = new Set();

440

441 for (var i = 0; i < _splitter._chunks.length; i++) {

442 var chunk = _splitter._chunks[i];

443

444 length += chunk.text.length;

445

446 // Ignore the split after the last chunk.

447 if (i == _splitter._chunks.length - 1) break;

448

449 if (_splits.shouldSplitAt(i)) {

450 endLine(i);

451

452 splitSpans.addAll(chunk.spans);

453

454 // Include the cost of the nested block.

455 if (chunk.blockChunks.isNotEmpty) {

456 cost +=

457 _splitter._writer.formatBlock(chunk, _splits.getColumn(i)).cost;

458 }

459

460 // Start the new line.

461 length = _splits.getColumn(i);

462 } else {

463 if (chunk.spaceWhenUnsplit) length++;

464

465 // Include the nested block inline, if any.

466 length += chunk.unsplitBlockLength;

467

468 // If we might be in the first overly long line, keep track of any

469 // unbound rules we encounter. These are ones that we'll want to try to

470 // bind to shorten the long line.

471 if (currentLineRules != null &&

472 chunk.rule != null &&

473 !chunk.isHardSplit &&

474 !_ruleValues.contains(chunk.rule)) {

475 currentLineRules.add(chunk.rule);

476 }

477 }

478 }

479

480 // Add the costs for the rules that split.

481 _ruleValues.forEach(_splitter._rules, (rule, value) {

482 // A rule may be bound to zero if another rule constrains it to not split.

483 if (value != 0) cost += rule.cost;

484 });

485

486 // Add the costs for the spans containing splits.

487 for (var span in splitSpans) cost += span.cost;

488

489 // Finish the last line.

490 endLine(_splitter._chunks.length);

491

492 _splits.setCost(cost);

493 }

494

495 String toString() {

496 var buffer = new StringBuffer();

497

498 buffer.writeAll(

499 _splitter._rules.map((rule) {

500 var valueLength = "${rule.fullySplitValue}".length;

501

502 var value = "?";

503 if (_ruleValues.contains(rule)) {

504 value = "${_ruleValues.getValue(rule)}";

505 }

506

507 value = value.padLeft(valueLength);

508 if (_liveRules.contains(rule)) {

509 value = debug.bold(value);

510 } else {

511 value = debug.gray(value);

512 }

513

514 return value;

515 }),

516 " ");

517

518 buffer.write(" \$${splits.cost}");

519

520 if (overflowChars > 0) buffer.write(" (${overflowChars} over)");

521 if (!_isComplete) buffer.write(" (incomplete)");

522 if (splits == null) buffer.write(" invalid");

523

524 return buffer.toString();

525 }

526 }

OLD	NEW

« no previous file with comments | « lib/src/debug.dart ('k') | lib/src/line_splitting/line_splitter.dart » ('j') | no next file with comments »