Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(640)

Side by Side Diff: components/dom_distiller/core/distiller.cc

Issue 146843010: Add support for multipage distillation. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Created 6 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright 2013 The Chromium Authors. All rights reserved. 1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/dom_distiller/core/distiller.h" 5 #include "components/dom_distiller/core/distiller.h"
6 6
7 #include <map> 7 #include <map>
8 8
9 #include "base/bind.h" 9 #include "base/bind.h"
10 #include "base/callback.h" 10 #include "base/callback.h"
11 #include "base/location.h"
12 #include "base/message_loop/message_loop.h"
13 #include "base/strings/string_number_conversions.h"
11 #include "base/strings/stringprintf.h" 14 #include "base/strings/stringprintf.h"
12 #include "base/strings/utf_string_conversions.h" 15 #include "base/strings/utf_string_conversions.h"
13 #include "base/values.h" 16 #include "base/values.h"
14 #include "components/dom_distiller/core/distiller_page.h" 17 #include "components/dom_distiller/core/distiller_page.h"
15 #include "components/dom_distiller/core/distiller_url_fetcher.h" 18 #include "components/dom_distiller/core/distiller_url_fetcher.h"
19 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
16 #include "components/dom_distiller/core/proto/distilled_page.pb.h" 20 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
17 #include "grit/dom_distiller_resources.h" 21 #include "grit/dom_distiller_resources.h"
18 #include "net/url_request/url_request_context_getter.h" 22 #include "net/url_request/url_request_context_getter.h"
19 #include "ui/base/resource/resource_bundle.h" 23 #include "ui/base/resource/resource_bundle.h"
20 #include "url/gurl.h" 24 #include "url/gurl.h"
21 25
26 namespace {
27 // Maximum number of distilled pages in an article.
28 const int kMaxPagesInArticle = 32;
29 }
30
22 namespace dom_distiller { 31 namespace dom_distiller {
23 32
24 DistillerFactoryImpl::DistillerFactoryImpl( 33 DistillerFactoryImpl::DistillerFactoryImpl(
25 scoped_ptr<DistillerPageFactory> distiller_page_factory, 34 scoped_ptr<DistillerPageFactory> distiller_page_factory,
26 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory) 35 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory)
27 : distiller_page_factory_(distiller_page_factory.Pass()), 36 : distiller_page_factory_(distiller_page_factory.Pass()),
28 distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()) {} 37 distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()) {}
29 38
30 DistillerFactoryImpl::~DistillerFactoryImpl() {} 39 DistillerFactoryImpl::~DistillerFactoryImpl() {}
31 40
32 scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() { 41 scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() {
33 scoped_ptr<DistillerImpl> distiller(new DistillerImpl( 42 scoped_ptr<DistillerImpl> distiller(new DistillerImpl(
34 *distiller_page_factory_, *distiller_url_fetcher_factory_)); 43 *distiller_page_factory_, *distiller_url_fetcher_factory_));
35 distiller->Init(); 44 distiller->Init();
36 return distiller.PassAs<Distiller>(); 45 return distiller.PassAs<Distiller>();
37 } 46 }
38 47
39 DistillerImpl::DistillerImpl( 48 DistillerImpl::DistillerImpl(
40 const DistillerPageFactory& distiller_page_factory, 49 const DistillerPageFactory& distiller_page_factory,
41 const DistillerURLFetcherFactory& distiller_url_fetcher_factory) 50 const DistillerURLFetcherFactory& distiller_url_fetcher_factory)
42 : distiller_page_factory_(distiller_page_factory), 51 : distiller_page_factory_(distiller_page_factory),
43 distiller_url_fetcher_factory_(distiller_url_fetcher_factory) { 52 distiller_url_fetcher_factory_(distiller_url_fetcher_factory),
53 distillation_in_progress_(false) {
44 distiller_page_ = distiller_page_factory_.CreateDistillerPage(this).Pass(); 54 distiller_page_ = distiller_page_factory_.CreateDistillerPage(this).Pass();
45 } 55 }
46 56
47 DistillerImpl::~DistillerImpl() { 57 DistillerImpl::~DistillerImpl() {
48 } 58 }
49 59
50 void DistillerImpl::Init() { 60 void DistillerImpl::Init() {
61 DCHECK(!distillation_in_progress_);
51 distiller_page_->Init(); 62 distiller_page_->Init();
63 article_proto_.reset(new DistilledArticleProto());
52 } 64 }
53 65
54 void DistillerImpl::DistillPage(const GURL& url, 66 void DistillerImpl::DistillPage(const GURL& url,
55 const DistillerCallback& distillation_cb) { 67 const DistillerCallback& distillation_cb) {
68 DCHECK(!distillation_in_progress_);
56 distillation_cb_ = distillation_cb; 69 distillation_cb_ = distillation_cb;
57 proto_.reset(new DistilledPageProto()); 70 DistillNextPage(url);
58 proto_->set_url(url.spec()); 71 }
72
73 void DistillerImpl::DistillNextPage(const GURL& url) {
74 DCHECK(!distillation_in_progress_);
75 if (url.is_valid() && article_proto_->pages_size() < kMaxPagesInArticle &&
76 processed_urls_.find(url.spec()) == processed_urls_.end()) {
77 distillation_in_progress_ = true;
78 // Distill the next page.
79 base::MessageLoop::current()->PostTask(
80 FROM_HERE,
81 base::Bind(
82 &DistillerImpl::AddAndDistillPage, base::Unretained(this), url));
83 } else {
84 DistillationTaskComplete();
85 }
86 }
87
88 void DistillerImpl::AddAndDistillPage(const GURL& url) {
89 DCHECK(distillation_in_progress_);
90 DCHECK(url.is_valid());
91 DCHECK_LT(article_proto_->pages_size(), kMaxPagesInArticle);
92 DistilledPageProto* page_proto = article_proto_->add_pages();
Yaron 2014/01/29 20:03:41 Why do you add the page before distillation occurs
shashi 2014/01/29 22:51:37 Done. No, it was just inline with previous code th
93 page_proto->set_url(url.spec());
59 LoadURL(url); 94 LoadURL(url);
60 } 95 }
61 96
62 void DistillerImpl::LoadURL(const GURL& url) { 97 void DistillerImpl::LoadURL(const GURL& url) {
63 distiller_page_->LoadURL(url); 98 distiller_page_->LoadURL(url);
64 } 99 }
65 100
66 void DistillerImpl::OnLoadURLDone() { 101 void DistillerImpl::OnLoadURLDone() {
67 GetDistilledContent(); 102 GetDistilledContent();
68 } 103 }
69 104
70 void DistillerImpl::GetDistilledContent() { 105 void DistillerImpl::GetDistilledContent() {
71 std::string script = 106 std::string script =
72 ResourceBundle::GetSharedInstance().GetRawDataResource( 107 ResourceBundle::GetSharedInstance().GetRawDataResource(
73 IDR_DISTILLER_JS).as_string(); 108 IDR_DISTILLER_JS).as_string();
74 distiller_page_->ExecuteJavaScript(script); 109 distiller_page_->ExecuteJavaScript(script);
75 } 110 }
76 111
77 void DistillerImpl::OnExecuteJavaScriptDone(const base::Value* value) { 112 void DistillerImpl::OnExecuteJavaScriptDone(const base::Value* value) {
113 DCHECK(distillation_in_progress_);
114
78 std::string result; 115 std::string result;
79 bool fetched_image = false;
80 const base::ListValue* result_list = NULL; 116 const base::ListValue* result_list = NULL;
81 if (!value->GetAsList(&result_list)) { 117 if (!value->GetAsList(&result_list)) {
82 DCHECK(proto_); 118 distillation_in_progress_ = false;
83 distillation_cb_.Run(proto_.Pass()); 119 DistillationTaskComplete();
84 return; 120 return;
85 } 121 }
122
123 int index = article_proto_->pages_size() - 1;
124 DCHECK_GE(index, 0);
125
126 DistilledPageProto* current_page = GetLastPage();
Yaron 2014/01/29 20:03:41 While we aren't planning to do in-parallel distill
shashi 2014/01/29 22:51:37 Parallel distillation will require creating multip
127 GURL next_page_url;
86 int i = 0; 128 int i = 0;
87 for (base::ListValue::const_iterator iter = result_list->begin(); 129 for (base::ListValue::const_iterator iter = result_list->begin();
88 iter != result_list->end(); ++iter, ++i) { 130 iter != result_list->end(); ++iter, ++i) {
89 std::string item; 131 std::string item;
90 (*iter)->GetAsString(&item); 132 (*iter)->GetAsString(&item);
91 // The JavaScript returns an array where the first element is the title, 133 // The JavaScript returns an array where the first element is the title,
92 // the second element is the article content HTML, and the remaining 134 // the second element is the article content HTML, and the remaining
93 // elements are image URLs referenced in the HTML. 135 // elements are image URLs referenced in the HTML.
94 switch (i) { 136 switch (i) {
95 case 0: 137 case 0:
96 proto_->set_title(item); 138 // Set the title of the article as the title of the first page.
139 if (article_proto_->pages_size() == 1)
140 article_proto_->set_title(item);
97 break; 141 break;
98 case 1: 142 case 1:
99 proto_->set_html(item); 143 current_page->set_html(item);
100 break; 144 break;
145 case 2: {
146 next_page_url = GURL(item);
147 if (next_page_url.is_valid()) {
148 GURL current_page_url(current_page->url());
149 // The pages should be in the same origin.
150 DCHECK_EQ(next_page_url.GetOrigin(), current_page_url.GetOrigin());
151 }
152 break;
153 }
101 default: 154 default:
102 int image_number = i - 2; 155 int page_number = article_proto_->pages_size();
103 std::string image_id = base::StringPrintf("%d", image_number); 156 int image_number = i - 3;
104 FetchImage(image_id, item); 157 std::string image_id = base::IntToString(page_number) + "_" +
105 fetched_image = true; 158 base::IntToString(image_number);
159 FetchImage(current_page, image_id, item);
106 } 160 }
107 } 161 }
108 if (!fetched_image) 162 processed_urls_.insert(current_page->url());
109 distillation_cb_.Run(proto_.Pass()); 163 distillation_in_progress_ = false;
164 DistillNextPage(next_page_url);
110 } 165 }
111 166
112 void DistillerImpl::FetchImage(const std::string& image_id, 167 DistilledPageProto* DistillerImpl::GetLastPage() const {
168 DCHECK_GT(article_proto_->pages_size(), 0);
169 int index = article_proto_->pages_size() - 1;
170 DCHECK_GE(index, 0);
171 return article_proto_->mutable_pages(index);
172 }
173
174 void DistillerImpl::FetchImage(DistilledPageProto* distilled_page_proto,
175 const std::string& image_id,
113 const std::string& item) { 176 const std::string& item) {
114 DistillerURLFetcher* fetcher = 177 DistillerURLFetcher* fetcher =
115 distiller_url_fetcher_factory_.CreateDistillerURLFetcher(); 178 distiller_url_fetcher_factory_.CreateDistillerURLFetcher();
116 image_fetchers_[image_id] = fetcher; 179 image_fetchers_[image_id] = fetcher;
117 fetcher->FetchURL(item, 180 fetcher->FetchURL(item,
118 base::Bind(&DistillerImpl::OnFetchImageDone, 181 base::Bind(&DistillerImpl::OnFetchImageDone,
119 base::Unretained(this), image_id)); 182 base::Unretained(this),
183 base::Unretained(distilled_page_proto),
184 image_id));
120 } 185 }
121 186
122 void DistillerImpl::OnFetchImageDone(const std::string& id, 187 void DistillerImpl::OnFetchImageDone(DistilledPageProto* distilled_page_proto,
188 const std::string& id,
123 const std::string& response) { 189 const std::string& response) {
124 DCHECK(proto_); 190 DCHECK_GT(article_proto_->pages_size(), 0);
125 DistilledPageProto_Image* image = proto_->add_image(); 191 DCHECK(distilled_page_proto);
192 DistilledPageProto_Image* image = distilled_page_proto->add_image();
126 image->set_name(id); 193 image->set_name(id);
127 image->set_data(response); 194 image->set_data(response);
128 DCHECK(image_fetchers_.end() != image_fetchers_.find(id)); 195 DCHECK(image_fetchers_.end() != image_fetchers_.find(id));
129 DistillerURLFetcher* fetcher = image_fetchers_[id]; 196 DistillerURLFetcher* fetcher = image_fetchers_[id];
130 int result = image_fetchers_.erase(id); 197 int result = image_fetchers_.erase(id);
131 delete fetcher; 198 delete fetcher;
132 DCHECK_EQ(1, result); 199 DCHECK_EQ(1, result);
133 if (image_fetchers_.empty()) { 200 DistillationTaskComplete();
Yaron 2014/01/29 20:03:41 Naming feels wrong because of this case. I had to
shashi 2014/01/29 22:51:37 Changed, hopefully better. On 2014/01/29 20:03:41,
134 distillation_cb_.Run(proto_.Pass()); 201 }
202
203 void DistillerImpl::DistillationTaskComplete() {
204 if (image_fetchers_.empty() && !distillation_in_progress_) {
205 distillation_cb_.Run(article_proto_.Pass());
135 } 206 }
136 } 207 }
137 208
138 } // namespace dom_distiller 209 } // namespace dom_distiller
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698