Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(98)

Side by Side Diff: components/dom_distiller/core/distiller.cc

Issue 146843010: Add support for multipage distillation. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Address Chris' comments. Created 6 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLD | NEW
1 // Copyright 2013 The Chromium Authors. All rights reserved. 1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/dom_distiller/core/distiller.h" 5 #include "components/dom_distiller/core/distiller.h"
6 6
7 #include <map> 7 #include <map>
8 8
9 #include "base/bind.h" 9 #include "base/bind.h"
10 #include "base/callback.h" 10 #include "base/callback.h"
11 #include "base/strings/stringprintf.h" 11 #include "base/location.h"
12 #include "base/message_loop/message_loop.h"
13 #include "base/strings/string_number_conversions.h"
12 #include "base/strings/utf_string_conversions.h" 14 #include "base/strings/utf_string_conversions.h"
13 #include "base/values.h" 15 #include "base/values.h"
14 #include "components/dom_distiller/core/distiller_page.h" 16 #include "components/dom_distiller/core/distiller_page.h"
15 #include "components/dom_distiller/core/distiller_url_fetcher.h" 17 #include "components/dom_distiller/core/distiller_url_fetcher.h"
18 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
16 #include "components/dom_distiller/core/proto/distilled_page.pb.h" 19 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
17 #include "grit/dom_distiller_resources.h"
18 #include "net/url_request/url_request_context_getter.h" 20 #include "net/url_request/url_request_context_getter.h"
19 #include "ui/base/resource/resource_bundle.h" 21
20 #include "url/gurl.h" 22 namespace {
23 // Maximum number of distilled pages in a article.
cjhopman 2014/02/03 21:47:22 s/ a / an /
shashi 2014/02/03 23:19:29 Done.
24 const int kMaxPagesInArticle = 32;
25 }
21 26
22 namespace dom_distiller { 27 namespace dom_distiller {
23 28
24 DistillerFactoryImpl::DistillerFactoryImpl( 29 DistillerFactoryImpl::DistillerFactoryImpl(
25 scoped_ptr<DistillerPageFactory> distiller_page_factory, 30 scoped_ptr<DistillerPageFactory> distiller_page_factory,
26 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory) 31 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory)
27 : distiller_page_factory_(distiller_page_factory.Pass()), 32 : distiller_page_factory_(distiller_page_factory.Pass()),
28 distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()) {} 33 distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()) {}
29 34
30 DistillerFactoryImpl::~DistillerFactoryImpl() {} 35 DistillerFactoryImpl::~DistillerFactoryImpl() {}
31 36
32 scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() { 37 scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() {
33 scoped_ptr<DistillerImpl> distiller(new DistillerImpl( 38 scoped_ptr<DistillerImpl> distiller(new DistillerImpl(
34 *distiller_page_factory_, *distiller_url_fetcher_factory_)); 39 *distiller_page_factory_, *distiller_url_fetcher_factory_));
35 distiller->Init(); 40 distiller->Init();
36 return distiller.PassAs<Distiller>(); 41 return distiller.PassAs<Distiller>();
37 } 42 }
38 43
39 DistillerImpl::DistillerImpl( 44 DistillerImpl::DistillerImpl(
40 const DistillerPageFactory& distiller_page_factory, 45 const DistillerPageFactory& distiller_page_factory,
41 const DistillerURLFetcherFactory& distiller_url_fetcher_factory) 46 const DistillerURLFetcherFactory& distiller_url_fetcher_factory)
42 : distiller_page_factory_(distiller_page_factory), 47 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory),
43 distiller_url_fetcher_factory_(distiller_url_fetcher_factory) { 48 distillation_in_progress_(false) {
44 distiller_page_ = distiller_page_factory_.CreateDistillerPage(this).Pass(); 49 page_distiller_.reset(new PageDistiller(distiller_page_factory));
45 } 50 }
46 51
47 DistillerImpl::~DistillerImpl() { 52 DistillerImpl::~DistillerImpl() {
48 } 53 }
49 54
50 void DistillerImpl::Init() { 55 void DistillerImpl::Init() {
51 distiller_page_->Init(); 56 DCHECK(!distillation_in_progress_);
57 page_distiller_->Init();
58 article_proto_.reset(new DistilledArticleProto());
52 } 59 }
53 60
54 void DistillerImpl::DistillPage(const GURL& url, 61 void DistillerImpl::DistillPage(const GURL& url,
55 const DistillerCallback& distillation_cb) { 62 const DistillerCallback& distillation_cb) {
63 DCHECK(!distillation_in_progress_);
56 distillation_cb_ = distillation_cb; 64 distillation_cb_ = distillation_cb;
57 proto_.reset(new DistilledPageProto()); 65 DistillPage(url);
58 proto_->set_url(url.spec());
59 LoadURL(url);
60 } 66 }
61 67
62 void DistillerImpl::LoadURL(const GURL& url) { 68 void DistillerImpl::DistillPage(const GURL& url) {
63 distiller_page_->LoadURL(url); 69 DCHECK(!distillation_in_progress_);
70 if (url.is_valid() && article_proto_->pages_size() < kMaxPagesInArticle &&
71 processed_urls_.find(url.spec()) == processed_urls_.end()) {
72 distillation_in_progress_ = true;
73 // Distill the next page.
74 base::MessageLoop::current()->PostTask(
cjhopman 2014/02/03 21:47:22 Do we need to post a task here? Can't we just call
shashi 2014/02/03 23:19:29 I was afraid that it may recurse, because OnPageDi
cjhopman 2014/02/03 23:56:53 Ah, I see now. Now I think that we should either
shashi 2014/02/04 01:39:37 Done.
75 FROM_HERE,
76 base::Bind(
77 &DistillerImpl::DistillNextPage, base::Unretained(this), url));
78 } else {
79 CheckIfAllCallbacksAreFinished();
80 }
64 } 81 }
65 82
66 void DistillerImpl::OnLoadURLDone() { 83 void DistillerImpl::DistillNextPage(const GURL& url) {
67 GetDistilledContent(); 84 DCHECK(distillation_in_progress_);
85 DCHECK(url.is_valid());
86 DCHECK_LT(article_proto_->pages_size(), kMaxPagesInArticle);
87 page_distiller_->DistillPage(
88 url,
89 base::Bind(&DistillerImpl::OnPageDistillationFinished,
90 base::Unretained(this),
91 url));
68 } 92 }
69 93
70 void DistillerImpl::GetDistilledContent() { 94 void DistillerImpl::OnPageDistillationFinished(
71 std::string script = 95 const GURL& page_url,
72 ResourceBundle::GetSharedInstance().GetRawDataResource( 96 const DistilledPageInfo& distilled_page,
73 IDR_DISTILLER_JS).as_string(); 97 bool distillation_successful) {
74 distiller_page_->ExecuteJavaScript(script); 98 DCHECK(distillation_in_progress_);
99 if (!distillation_successful) {
100 CheckIfAllCallbacksAreFinished();
101 } else {
102 DistilledPageProto* current_page = article_proto_->add_pages();
103 // Set the title of the article as the title of the first page.
104 if (article_proto_->pages_size() == 1) {
105 article_proto_->set_title(distilled_page.title);
106 }
107
108 current_page->set_url(page_url.spec());
109 current_page->set_html(distilled_page.html);
110
111 GURL next_page_url(distilled_page.next_page_url);
112 if (next_page_url.is_valid()) {
113 // The pages should be in same origin.
114 DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin());
115 }
116
117 processed_urls_.insert(page_url.spec());
118 distillation_in_progress_ = false;
119 int page_number = article_proto_->pages_size();
120 for (size_t img_num = 0; img_num < distilled_page.image_urls.size();
121 ++img_num) {
122 std::string image_id =
123 base::IntToString(page_number) + "_" + base::IntToString(img_num);
124 FetchImage(current_page, image_id, distilled_page.image_urls[img_num]);
125 }
126 DistillPage(next_page_url);
127 }
75 } 128 }
76 129
77 void DistillerImpl::OnExecuteJavaScriptDone(const base::Value* value) { 130 void DistillerImpl::FetchImage(DistilledPageProto* distilled_page_proto,
78 std::string result; 131 const std::string& image_id,
79 bool fetched_image = false;
80 const base::ListValue* result_list = NULL;
81 if (!value->GetAsList(&result_list)) {
82 DCHECK(proto_);
83 distillation_cb_.Run(proto_.Pass());
84 return;
85 }
86 int i = 0;
87 for (base::ListValue::const_iterator iter = result_list->begin();
88 iter != result_list->end(); ++iter, ++i) {
89 std::string item;
90 (*iter)->GetAsString(&item);
91 // The JavaScript returns an array where the first element is the title,
92 // the second element is the article content HTML, and the remaining
93 // elements are image URLs referenced in the HTML.
94 switch (i) {
95 case 0:
96 proto_->set_title(item);
97 break;
98 case 1:
99 proto_->set_html(item);
100 break;
101 default:
102 int image_number = i - 2;
103 std::string image_id = base::StringPrintf("%d", image_number);
104 FetchImage(image_id, item);
105 fetched_image = true;
106 }
107 }
108 if (!fetched_image)
109 distillation_cb_.Run(proto_.Pass());
110 }
111
112 void DistillerImpl::FetchImage(const std::string& image_id,
113 const std::string& item) { 132 const std::string& item) {
114 DistillerURLFetcher* fetcher = 133 DistillerURLFetcher* fetcher =
115 distiller_url_fetcher_factory_.CreateDistillerURLFetcher(); 134 distiller_url_fetcher_factory_.CreateDistillerURLFetcher();
116 image_fetchers_[image_id] = fetcher; 135 image_fetchers_[image_id] = fetcher;
117 fetcher->FetchURL(item, 136 fetcher->FetchURL(item,
118 base::Bind(&DistillerImpl::OnFetchImageDone, 137 base::Bind(&DistillerImpl::OnFetchImageDone,
119 base::Unretained(this), image_id)); 138 base::Unretained(this),
139 base::Unretained(distilled_page_proto),
140 image_id));
120 } 141 }
121 142
122 void DistillerImpl::OnFetchImageDone(const std::string& id, 143 void DistillerImpl::OnFetchImageDone(DistilledPageProto* distilled_page_proto,
144 const std::string& id,
123 const std::string& response) { 145 const std::string& response) {
124 DCHECK(proto_); 146 DCHECK_GT(article_proto_->pages_size(), 0);
125 DistilledPageProto_Image* image = proto_->add_image(); 147 DCHECK(distilled_page_proto);
148 DistilledPageProto_Image* image = distilled_page_proto->add_image();
126 image->set_name(id); 149 image->set_name(id);
127 image->set_data(response); 150 image->set_data(response);
128 DCHECK(image_fetchers_.end() != image_fetchers_.find(id)); 151 DCHECK(image_fetchers_.end() != image_fetchers_.find(id));
129 DistillerURLFetcher* fetcher = image_fetchers_[id]; 152 DistillerURLFetcher* fetcher = image_fetchers_[id];
130 int result = image_fetchers_.erase(id); 153 int result = image_fetchers_.erase(id);
131 delete fetcher; 154 delete fetcher;
cjhopman 2014/02/03 21:47:22 It looks like there is a lot going on in this clas
shashi 2014/02/03 23:19:29 Done, filed: http://crbug.com/340431 On 2014/02/0
132 DCHECK_EQ(1, result); 155 DCHECK_EQ(1, result);
133 if (image_fetchers_.empty()) { 156 CheckIfAllCallbacksAreFinished();
134 distillation_cb_.Run(proto_.Pass()); 157 }
158
159 void DistillerImpl::CheckIfAllCallbacksAreFinished() {
cjhopman 2014/02/03 21:47:22 I don't like this function name. I would expect th
shashi 2014/02/03 23:19:29 Done.
160 if (image_fetchers_.empty() && !distillation_in_progress_) {
161 distillation_cb_.Run(article_proto_.Pass());
135 } 162 }
136 } 163 }
137 164
138 } // namespace dom_distiller 165 } // namespace dom_distiller
OLD | NEW

Powered by Google App Engine
This is Rietveld 408576698