lib/src/line_splitter.dart - Issue 1257903002: Optimize splitting long, complex lines.

Unified Diff: lib/src/line_splitter.dart

Issue 1257903002: Optimize splitting long, complex lines. (Closed) Base URL: https://github.com/dart-lang/dart_style.git@master

Patch Set: Created 5 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: lib/src/line_splitter.dart

diff --git a/lib/src/line_splitter.dart b/lib/src/line_splitter.dart

deleted file mode 100644

index f89d4f430e3cd197901775332035517a77b24c0b..0000000000000000000000000000000000000000

--- a/lib/src/line_splitter.dart

+++ /dev/null

@@ -1,526 +0,0 @@

-// BSD-style license that can be found in the LICENSE file.

-library dart_style.src.line_splitter;

-import 'package:collection/priority_queue.dart';

-import 'chunk.dart';

-import 'debug.dart' as debug;

-import 'line_writer.dart';

-import 'rule/rule.dart';

-import 'rule_set.dart';

-/// To ensure the solver doesn't go totally pathological on giant code, we cap

-/// it at a fixed number of attempts.

-///

-/// If the optimal solution isn't found after this many tries, it just uses the

-/// best it found so far.

-const _maxAttempts = 50000;

-/// Takes a set of chunks and determines the best values for its rules in order

-/// to fit it inside the page boundary.

-///

-/// This problem is exponential in the number of rules and a single expression

-/// in Dart can be quite large, so it isn't feasible to brute force this. For

-/// example:

-///

-/// outer(

-/// fn(1 + 2, 3 + 4, 5 + 6, 7 + 8),

-/// fn(1 + 2, 3 + 4, 5 + 6, 7 + 8));

-///

-/// There are 509,607,936 ways this can be split.

-///

-/// The problem is even harder because we may not be able to easily tell if a

-/// given solution is the best one. It's possible that there is *no* solution

-/// that fits in the page (due to long strings or identifiers) so the winning

-/// solution may still have overflow characters. This makes it hard to know

-/// when we are done and can stop looking.

-///

-/// There are a couple of pieces of domain knowledge we use to cope with this:

-///

-/// - Changing a rule from unsplit to split will never lower its cost. A

-/// solution with all rules unsplit will always be the one with the lowest

-/// cost (zero). Conversely, setting all of its rules to the maximum split

-/// value will always have the highest cost.

-///

-/// (You might think there is a converse rule about overflow characters. The

-/// solution with the fewest splits will have the most overflow, and the

-/// solution with the most splits will have the least overflow. Alas, because

-/// of indentation, that isn't always the case. Adding a split may *increase*

-/// overflow in some cases.)

-///

-/// - If all of the chunks for a rule are inside lines that already fit in the

-/// page, then splitting that rule will never improve the solution.

-///

-/// We start off with a [SolveState] where all rules are unbound (which

-/// implicitly treats them as unsplit). For a given solve state, we can produce

-/// a set of expanded states that takes some of the rules in the first long

-/// line and bind them to split values. This always produces new solve states

-/// with higher cost (but often fewer overflow characters) than the parent

-/// state.

-///

-/// We take these expanded states and add them to a work list sorted by cost.

-/// Since unsplit rules always have lower cost solutions, we know that no state

-/// we enqueue later will ever have a lower cost than the ones we already have

-/// enqueued.

-///

-/// Then we keep pulling states off the work list and expanding them and adding

-/// the results back into the list. We do this until we hit a solution where

-/// all characters fit in the page. The first one we find will have the lowest

-/// cost and we're done.

-///

-/// We also keep running track of the best solution we've found so far that

-/// has the fewest overflow characters and the lowest cost. If no solution fits,

-/// we'll use this one.

-///

-/// As a final escape hatch for pathologically nasty code, after trying some

-/// fixed maximum number of solve states, we just bail and return the best

-/// solution found so far.

-///

-/// Even with the above algorithmic optimizations, complex code may still

-/// require a lot of exploring to find an optimal solution. To make that fast,

-/// this code is carefully profiled and optimized. If you modify this, make

-/// sure to test against the benchmark to ensure you don't regress performance.

-class LineSplitter {

- final LineWriter _writer;

- /// The list of chunks being split.

- final List<Chunk> _chunks;

- /// The set of soft rules whose values are being selected.

- final List<Rule> _rules;

- /// The number of characters of additional indentation to apply to each line.

- ///

- /// This is used when formatting blocks to get the output into the right

- /// column based on where the block appears.

- final int _blockIndentation;

- /// The starting column of the first line.

- final int _firstLineIndent;

- /// The list of solve states to explore further.

- ///

- /// This is sorted lowest-cost first. This ensures that as soon as we find a

- /// solution that fits in the page, we know it will be the lowest cost one

- /// and can stop looking.

- final _workList = new HeapPriorityQueue<SolveState>();

- /// The lowest cost solution found so far.

- SolveState _bestSolution;

- /// Creates a new splitter for [_writer] that tries to fit [chunks] into the

- /// page width.

- LineSplitter(this._writer, List<Chunk> chunks, int blockIndentation,

- int firstLineIndent,

- {bool flushLeft: false})

- : _chunks = chunks,

- // Collect the set of soft rules that we need to select values for.

- _rules = chunks

- .map((chunk) => chunk.rule)

- .where((rule) => rule != null && rule is! HardSplitRule)

- .toSet()

- .toList(growable: false),

- _blockIndentation = blockIndentation,

- _firstLineIndent = flushLeft ? 0 : firstLineIndent + blockIndentation {

- // Store the rule's index in the rule so we can get from a chunk to a rule

- // index quickly.

- for (var i = 0; i < _rules.length; i++) {

- _rules[i].index = i;

- }

- /// Determine the best way to split the chunks into lines that fit in the

- /// page, if possible.

- ///

- /// Returns a [SplitSet] that defines where each split occurs and the

- /// indentation of each line.

- ///

- /// [firstLineIndent] is the number of characters of whitespace to prefix the

- /// first line of output with.

- SplitSet apply() {

- // Start with a completely unbound, unsplit solution.

- _workList.add(new SolveState(this, new RuleSet(_rules.length)));

- var attempts = 0;

- while (!_workList.isEmpty) {

- var state = _workList.removeFirst();

- if (state.isBetterThan(_bestSolution)) {

- _bestSolution = state;

- // Since we sort solutions by cost the first solution we find that

- // fits is the winner.

- if (_bestSolution.overflowChars == 0) break;

- }

- if (debug.traceSplitter) {

- var best = state == _bestSolution ? " (best)" : "";

- debug.log("$state$best");

- debug.dumpLines(_chunks, _firstLineIndent, state.splits);

- debug.log();

- }

- if (attempts++ > _maxAttempts) break;

- // Try bumping the rule values for rules whose chunks are on long lines.

- state.expand();

- }

- if (debug.traceSplitter) {

- debug.log("$_bestSolution (winner)");

- debug.dumpLines(_chunks, _firstLineIndent, _bestSolution.splits);

- debug.log();

- }

- return _bestSolution.splits;

- }

-/// A possibly incomplete solution in the line splitting search space.

-///

-/// A single [SolveState] binds some subset of the rules to values while

-/// leaving the rest unbound. If every rule is bound, the solve state describes

-/// a complete solution to the line splitting problem. Even if rules are

-/// unbound, a state can also usually be used as a solution by treating all

-/// unbound rules as unsplit. (The usually is because a state that constrains

-/// an unbound rule to split can't be used with that rule unsplit.)

-///

-/// From a given solve state, we can explore the search tree to more refined

-/// solve states by producing new ones that add more bound rules to the current

-/// state.

-class SolveState implements Comparable<SolveState> {

- final LineSplitter _splitter;

- final RuleSet _ruleValues;

- /// The unbound rules in this state that can be bound to produce new more

- /// refined states.

- ///

- /// Keeping this set small is the key to make the entire line splitter

- /// perform well. If we consider too make rules at each state, our

- /// exploration of the solution space is too branchy and we waste time on

- /// dead end solutions.

- ///

- /// Here is the key insight. The line splitter treats any unbound rule as

- /// being unsplit. This means refining a solution always means taking a rule

- /// that is unsplit and making it split. That monotonically increases the

- /// cost, but may help fit the solution inside the page.

- ///

- /// We want to keep the cost low, so the only reason to consider making a

- /// rule split is if it reduces an overflowing line. It's also the case that

- /// splitting an earlier rule will often reshuffle the rest of the line.

- ///

- /// Taking that into account, the only rules we consider binding to extend a

- /// solve state are *unbound rules inside the first line that is overflowing*.

- /// Even if a line has dozens of rules, this generally keeps the branching

- /// down to a few. It also means rules inside lines that already fit are

- /// never touched.

- ///

- /// There is one other set of rules that go in here. Sometimes a bound rule

- /// in the solve state constrains some other unbound rule to split. In that

- /// case, we also consider that active so we know to not leave it at zero.

- final _liveRules = new Set<Rule>();

- /// The set of splits chosen for this state.

- SplitSet get splits => _splits;

- SplitSet _splits;

- /// The number of characters that do not fit inside the page with this set of

- /// splits.

- int get overflowChars => _overflowChars;

- int _overflowChars;

- /// Whether we can treat this state as a complete solution by leaving its

- /// unbound rules unsplit.

- ///

- /// This is generally true but will be false if the state contains any

- /// unbound rules that are constrained to not be zero by other bound rules.

- /// This avoids picking a solution that leaves those rules at zero when they

- /// aren't allowed to be.

- bool _isComplete = true;

- SolveState(this._splitter, this._ruleValues) {

- _calculateSplits();

- _calculateCost();

- }

- /// Orders this state relative to [other].

- ///

- /// This is the best-first ordering that the [LineSplitter] uses in its

- /// worklist. It prefers cheaper states even if they overflow because this

- /// ensures it finds the best solution first as soon as it finds one that

- /// fits in the page so it can early out.

- int compareTo(SolveState other) {

- // TODO(rnystrom): It may be worth sorting by the estimated lowest number

- // of overflow characters first. That doesn't help in cases where there is

- // a solution that fits, but may help in corner cases where there is no

- // fitting solution.

- if (splits.cost != other.splits.cost) {

- return splits.cost.compareTo(other.splits.cost);

- }

- if (overflowChars != other.overflowChars) {

- return overflowChars.compareTo(other.overflowChars);

- }

- // Distinguish states based on the rule values just so that states with the

- // same cost range but different rule values don't get considered identical

- // by HeapPriorityQueue.

- for (var rule in _splitter._rules) {

- var value = _ruleValues.getValue(rule);

- var otherValue = other._ruleValues.getValue(rule);

- if (value != otherValue) return value.compareTo(otherValue);

- }

- // If we get here, this state is identical to [other].

- return 0;

- }

- /// Returns `true` if this state is a better solution to use as the final

- /// result than [other].

- bool isBetterThan(SolveState other) {

- // If this state contains an unbound rule that we know can't be left

- // unsplit, we can't pick this as a solution.

- if (!_isComplete) return false;

- // Anything is better than nothing.

- if (other == null) return true;

- // Prefer the solution that fits the most in the page.

- if (overflowChars != other.overflowChars) {

- return overflowChars < other.overflowChars;

- }

- // Otherwise, prefer the best cost.

- return splits.cost < other.splits.cost;

- }

- /// Enqueues more solve states to consider based on this one.

- ///

- /// For each unbound rule in this state that occurred in the first long line,

- /// enqueue solve states that bind that rule to each value it can have and

- /// bind all previous rules to zero. (In other words, try all subsolutions

- /// where that rule becomes the first new rule to split at.)

- void expand() {

- var unsplitRules = _ruleValues.clone();

- // Walk down the rules looking for unbound ones to try.

- var triedRules = 0;

- for (var rule in _splitter._rules) {

- if (_liveRules.contains(rule)) {

- // We found one worth trying, so try all of its values.

- for (var value = 1; value < rule.numValues; value++) {

- var boundRules = unsplitRules.clone();

- var mustSplitRules;

- var valid = boundRules.tryBind(_splitter._rules, rule, value, (rule) {

- if (mustSplitRules == null) mustSplitRules = [];

- mustSplitRules.add(rule);

- });

- // Make sure we don't violate the constraints of the bound rules.

- if (!valid) continue;

- var state = new SolveState(_splitter, boundRules);

- // If some unbound rules are constrained to split, remember that.

- if (mustSplitRules != null) {

- state._isComplete = false;

- state._liveRules.addAll(mustSplitRules);

- }

- _splitter._workList.add(state);

- }

- // Stop once we've tried all of the ones we can.

- if (++triedRules == _liveRules.length) break;

- }

- // Fill in previous unbound rules with zero.

- if (!_ruleValues.contains(rule)) {

- // Pass a dummy callback because zero will never fail. (If it would

- // have, that rule would already be bound to some other value.)

- if (!unsplitRules.tryBind(_splitter._rules, rule, 0, (_) {})) {

- break;

- }

- /// Calculates the [SplitSet] for this solve state, assuming any unbound

- /// rules are set to zero.

- void _calculateSplits() {

- // Figure out which expression nesting levels got split and need to be

- // assigned columns.

- var usedNestingLevels = new Set();

- for (var i = 0; i < _splitter._chunks.length - 1; i++) {

- var chunk = _splitter._chunks[i];

- if (chunk.rule.isSplit(_getValue(chunk.rule), chunk)) {

- usedNestingLevels.add(chunk.nesting);

- chunk.nesting.clearTotalUsedIndent();

- }

- for (var nesting in usedNestingLevels) {

- nesting.refreshTotalUsedIndent(usedNestingLevels);

- }

- _splits = new SplitSet(_splitter._chunks.length);

- for (var i = 0; i < _splitter._chunks.length - 1; i++) {

- var chunk = _splitter._chunks[i];

- if (chunk.rule.isSplit(_getValue(chunk.rule), chunk)) {

- var indent = 0;

- if (!chunk.flushLeftAfter) {

- // Add in the chunk's indent.

- indent = _splitter._blockIndentation + chunk.indent;

- // And any expression nesting.

- indent += chunk.nesting.totalUsedIndent;

- }

- _splits.add(i, indent);

- }

- /// Gets the value to use for [rule], either the bound value or `0` if it

- /// isn't bound.

- int _getValue(Rule rule) {

- if (rule is HardSplitRule) return 0;

- return _ruleValues.getValue(rule);

- }

- /// Evaluates the cost (i.e. the relative "badness") of splitting the line

- /// into [lines] physical lines based on the current set of rules.

- void _calculateCost() {

- assert(_splits != null);

- // Calculate the length of each line and apply the cost of any spans that

- // get split.

- var cost = 0;

- _overflowChars = 0;

- var length = _splitter._firstLineIndent;

- // The unbound rules in use by the current line. This will be null after

- // the first long line has completed.

- var currentLineRules = [];

- endLine(int end) {

- // Track lines that went over the length. It is only rules contained in

- // long lines that we may want to split.

- if (length > _splitter._writer.pageWidth) {

- _overflowChars += length - _splitter._writer.pageWidth;

- // Only try rules that are in the first long line, since we know at

- // least one of them *will* be split.

- if (currentLineRules != null && currentLineRules.isNotEmpty) {

- _liveRules.addAll(currentLineRules);

- currentLineRules = null;

- }

- } else {

- // The line fit, so don't keep track of its rules.

- if (currentLineRules != null) {

- currentLineRules.clear();

- }

- // The set of spans that contain chunks that ended up splitting. We store

- // these in a set so a span's cost doesn't get double-counted if more than

- // one split occurs in it.

- var splitSpans = new Set();

- for (var i = 0; i < _splitter._chunks.length; i++) {

- var chunk = _splitter._chunks[i];

- length += chunk.text.length;

- // Ignore the split after the last chunk.

- if (i == _splitter._chunks.length - 1) break;

- if (_splits.shouldSplitAt(i)) {

- endLine(i);

- splitSpans.addAll(chunk.spans);

- // Include the cost of the nested block.

- if (chunk.blockChunks.isNotEmpty) {

- cost +=

- _splitter._writer.formatBlock(chunk, _splits.getColumn(i)).cost;

- }

- // Start the new line.

- length = _splits.getColumn(i);

- } else {

- if (chunk.spaceWhenUnsplit) length++;

- // Include the nested block inline, if any.

- length += chunk.unsplitBlockLength;

- // If we might be in the first overly long line, keep track of any

- // unbound rules we encounter. These are ones that we'll want to try to

- // bind to shorten the long line.

- if (currentLineRules != null &&

- chunk.rule != null &&

- !chunk.isHardSplit &&

- !_ruleValues.contains(chunk.rule)) {

- currentLineRules.add(chunk.rule);

- }

- // Add the costs for the rules that split.

- _ruleValues.forEach(_splitter._rules, (rule, value) {

- // A rule may be bound to zero if another rule constrains it to not split.

- if (value != 0) cost += rule.cost;

- });

- // Add the costs for the spans containing splits.

- for (var span in splitSpans) cost += span.cost;

- // Finish the last line.

- endLine(_splitter._chunks.length);

- _splits.setCost(cost);

- }

- String toString() {

- var buffer = new StringBuffer();

- buffer.writeAll(

- _splitter._rules.map((rule) {

- var valueLength = "${rule.fullySplitValue}".length;

- var value = "?";

- if (_ruleValues.contains(rule)) {

- value = "${_ruleValues.getValue(rule)}";

- }

- value = value.padLeft(valueLength);

- if (_liveRules.contains(rule)) {

- value = debug.bold(value);

- } else {

- value = debug.gray(value);

- }

- return value;

- }),

- " ");

- buffer.write(" \$${splits.cost}");

- if (overflowChars > 0) buffer.write(" (${overflowChars} over)");

- if (!_isComplete) buffer.write(" (incomplete)");

- if (splits == null) buffer.write(" invalid");

- return buffer.toString();

- }

« no previous file with comments | « lib/src/debug.dart ('k') | lib/src/line_splitting/line_splitter.dart » ('j') | no next file with comments »