OLD | NEW |
---|---|
1 // Copyright 2013 The Chromium Authors. All rights reserved. | 1 // Copyright 2013 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "components/dom_distiller/core/distiller.h" | 5 #include "components/dom_distiller/core/distiller.h" |
6 | 6 |
7 #include <map> | 7 #include <map> |
8 | 8 |
9 #include "base/bind.h" | 9 #include "base/bind.h" |
10 #include "base/callback.h" | 10 #include "base/callback.h" |
11 #include "base/location.h" | |
12 #include "base/message_loop/message_loop.h" | |
13 #include "base/strings/string_number_conversions.h" | |
11 #include "base/strings/stringprintf.h" | 14 #include "base/strings/stringprintf.h" |
12 #include "base/strings/utf_string_conversions.h" | 15 #include "base/strings/utf_string_conversions.h" |
13 #include "base/values.h" | 16 #include "base/values.h" |
14 #include "components/dom_distiller/core/distiller_page.h" | 17 #include "components/dom_distiller/core/distiller_page.h" |
15 #include "components/dom_distiller/core/distiller_url_fetcher.h" | 18 #include "components/dom_distiller/core/distiller_url_fetcher.h" |
19 #include "components/dom_distiller/core/proto/distilled_article.pb.h" | |
16 #include "components/dom_distiller/core/proto/distilled_page.pb.h" | 20 #include "components/dom_distiller/core/proto/distilled_page.pb.h" |
17 #include "grit/dom_distiller_resources.h" | 21 #include "grit/dom_distiller_resources.h" |
18 #include "net/url_request/url_request_context_getter.h" | 22 #include "net/url_request/url_request_context_getter.h" |
19 #include "ui/base/resource/resource_bundle.h" | 23 #include "ui/base/resource/resource_bundle.h" |
20 #include "url/gurl.h" | 24 #include "url/gurl.h" |
21 | 25 |
26 namespace { | |
27 // Maximum number of distilled pages in an article. | |
28 const int kMaxPagesInArticle = 32; | |
29 } | |
30 | |
22 namespace dom_distiller { | 31 namespace dom_distiller { |
23 | 32 |
24 DistillerFactoryImpl::DistillerFactoryImpl( | 33 DistillerFactoryImpl::DistillerFactoryImpl( |
25 scoped_ptr<DistillerPageFactory> distiller_page_factory, | 34 scoped_ptr<DistillerPageFactory> distiller_page_factory, |
26 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory) | 35 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory) |
27 : distiller_page_factory_(distiller_page_factory.Pass()), | 36 : distiller_page_factory_(distiller_page_factory.Pass()), |
28 distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()) {} | 37 distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()) {} |
29 | 38 |
30 DistillerFactoryImpl::~DistillerFactoryImpl() {} | 39 DistillerFactoryImpl::~DistillerFactoryImpl() {} |
31 | 40 |
32 scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() { | 41 scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() { |
33 scoped_ptr<DistillerImpl> distiller(new DistillerImpl( | 42 scoped_ptr<DistillerImpl> distiller(new DistillerImpl( |
34 *distiller_page_factory_, *distiller_url_fetcher_factory_)); | 43 *distiller_page_factory_, *distiller_url_fetcher_factory_)); |
35 distiller->Init(); | 44 distiller->Init(); |
36 return distiller.PassAs<Distiller>(); | 45 return distiller.PassAs<Distiller>(); |
37 } | 46 } |
38 | 47 |
39 DistillerImpl::DistillerImpl( | 48 DistillerImpl::DistillerImpl( |
40 const DistillerPageFactory& distiller_page_factory, | 49 const DistillerPageFactory& distiller_page_factory, |
41 const DistillerURLFetcherFactory& distiller_url_fetcher_factory) | 50 const DistillerURLFetcherFactory& distiller_url_fetcher_factory) |
42 : distiller_page_factory_(distiller_page_factory), | 51 : distiller_page_factory_(distiller_page_factory), |
43 distiller_url_fetcher_factory_(distiller_url_fetcher_factory) { | 52 distiller_url_fetcher_factory_(distiller_url_fetcher_factory), |
53 distillation_in_progress_(false) { | |
44 distiller_page_ = distiller_page_factory_.CreateDistillerPage(this).Pass(); | 54 distiller_page_ = distiller_page_factory_.CreateDistillerPage(this).Pass(); |
45 } | 55 } |
46 | 56 |
47 DistillerImpl::~DistillerImpl() { | 57 DistillerImpl::~DistillerImpl() { |
48 } | 58 } |
49 | 59 |
50 void DistillerImpl::Init() { | 60 void DistillerImpl::Init() { |
61 DCHECK(!distillation_in_progress_); | |
51 distiller_page_->Init(); | 62 distiller_page_->Init(); |
63 article_proto_.reset(new DistilledArticleProto()); | |
52 } | 64 } |
53 | 65 |
54 void DistillerImpl::DistillPage(const GURL& url, | 66 void DistillerImpl::DistillPage(const GURL& url, |
55 const DistillerCallback& distillation_cb) { | 67 const DistillerCallback& distillation_cb) { |
68 DCHECK(!distillation_in_progress_); | |
56 distillation_cb_ = distillation_cb; | 69 distillation_cb_ = distillation_cb; |
57 proto_.reset(new DistilledPageProto()); | 70 DistillNextPage(url); |
58 proto_->set_url(url.spec()); | 71 } |
72 | |
73 void DistillerImpl::DistillNextPage(const GURL& url) { | |
74 DCHECK(!distillation_in_progress_); | |
75 if (url.is_valid() && article_proto_->pages_size() < kMaxPagesInArticle && | |
76 processed_urls_.find(url.spec()) == processed_urls_.end()) { | |
77 distillation_in_progress_ = true; | |
78 // Distill the next page. | |
79 base::MessageLoop::current()->PostTask( | |
80 FROM_HERE, | |
81 base::Bind( | |
82 &DistillerImpl::AddAndDistillPage, base::Unretained(this), url)); | |
83 } else { | |
84 DistillationTaskComplete(); | |
85 } | |
86 } | |
87 | |
88 void DistillerImpl::AddAndDistillPage(const GURL& url) { | |
89 DCHECK(distillation_in_progress_); | |
90 DCHECK(url.is_valid()); | |
91 DCHECK_LT(article_proto_->pages_size(), kMaxPagesInArticle); | |
92 DistilledPageProto* page_proto = article_proto_->add_pages(); | |
Yaron
2014/01/29 20:03:41
Why do you add the page before distillation occurs
shashi
2014/01/29 22:51:37
Done. No, it was just inline with previous code th
| |
93 page_proto->set_url(url.spec()); | |
59 LoadURL(url); | 94 LoadURL(url); |
60 } | 95 } |
61 | 96 |
62 void DistillerImpl::LoadURL(const GURL& url) { | 97 void DistillerImpl::LoadURL(const GURL& url) { |
63 distiller_page_->LoadURL(url); | 98 distiller_page_->LoadURL(url); |
64 } | 99 } |
65 | 100 |
66 void DistillerImpl::OnLoadURLDone() { | 101 void DistillerImpl::OnLoadURLDone() { |
67 GetDistilledContent(); | 102 GetDistilledContent(); |
68 } | 103 } |
69 | 104 |
70 void DistillerImpl::GetDistilledContent() { | 105 void DistillerImpl::GetDistilledContent() { |
71 std::string script = | 106 std::string script = |
72 ResourceBundle::GetSharedInstance().GetRawDataResource( | 107 ResourceBundle::GetSharedInstance().GetRawDataResource( |
73 IDR_DISTILLER_JS).as_string(); | 108 IDR_DISTILLER_JS).as_string(); |
74 distiller_page_->ExecuteJavaScript(script); | 109 distiller_page_->ExecuteJavaScript(script); |
75 } | 110 } |
76 | 111 |
77 void DistillerImpl::OnExecuteJavaScriptDone(const base::Value* value) { | 112 void DistillerImpl::OnExecuteJavaScriptDone(const base::Value* value) { |
113 DCHECK(distillation_in_progress_); | |
114 | |
78 std::string result; | 115 std::string result; |
79 bool fetched_image = false; | |
80 const base::ListValue* result_list = NULL; | 116 const base::ListValue* result_list = NULL; |
81 if (!value->GetAsList(&result_list)) { | 117 if (!value->GetAsList(&result_list)) { |
82 DCHECK(proto_); | 118 distillation_in_progress_ = false; |
83 distillation_cb_.Run(proto_.Pass()); | 119 DistillationTaskComplete(); |
84 return; | 120 return; |
85 } | 121 } |
122 | |
123 int index = article_proto_->pages_size() - 1; | |
124 DCHECK_GE(index, 0); | |
125 | |
126 DistilledPageProto* current_page = GetLastPage(); | |
Yaron
2014/01/29 20:03:41
While we aren't planning to do in-parallel distill
shashi
2014/01/29 22:51:37
Parallel distillation will require creating multip
| |
127 GURL next_page_url; | |
86 int i = 0; | 128 int i = 0; |
87 for (base::ListValue::const_iterator iter = result_list->begin(); | 129 for (base::ListValue::const_iterator iter = result_list->begin(); |
88 iter != result_list->end(); ++iter, ++i) { | 130 iter != result_list->end(); ++iter, ++i) { |
89 std::string item; | 131 std::string item; |
90 (*iter)->GetAsString(&item); | 132 (*iter)->GetAsString(&item); |
91 // The JavaScript returns an array where the first element is the title, | 133 // The JavaScript returns an array where the first element is the title, |
92 // the second element is the article content HTML, and the remaining | 134 // the second is the article content HTML, the third is the next page URL, |
93 // elements are image URLs referenced in the HTML. | 135 // and the remaining elements are image URLs referenced in the HTML. |
94 switch (i) { | 136 switch (i) { |
95 case 0: | 137 case 0: |
96 proto_->set_title(item); | 138 // Use the title of the first page as the title of the article. |
139 if (article_proto_->pages_size() == 1) | |
140 article_proto_->set_title(item); | |
97 break; | 141 break; |
98 case 1: | 142 case 1: |
99 proto_->set_html(item); | 143 current_page->set_html(item); |
100 break; | 144 break; |
145 case 2: { | |
146 next_page_url = GURL(item); | |
147 if (next_page_url.is_valid()) { | |
148 GURL current_page_url(current_page->url()); | |
149 // The pages should be in the same origin. | |
150 DCHECK_EQ(next_page_url.GetOrigin(), current_page_url.GetOrigin()); | |
151 } | |
152 break; | |
153 } | |
101 default: | 154 default: |
102 int image_number = i - 2; | 155 int page_number = article_proto_->pages_size(); |
103 std::string image_id = base::StringPrintf("%d", image_number); | 156 int image_number = i - 3; |
104 FetchImage(image_id, item); | 157 std::string image_id = base::IntToString(page_number) + "_" + |
105 fetched_image = true; | 158 base::IntToString(image_number); |
159 FetchImage(current_page, image_id, item); | |
106 } | 160 } |
107 } | 161 } |
108 if (!fetched_image) | 162 processed_urls_.insert(current_page->url()); |
109 distillation_cb_.Run(proto_.Pass()); | 163 distillation_in_progress_ = false; |
164 DistillNextPage(next_page_url); | |
110 } | 165 } |
111 | 166 |
112 void DistillerImpl::FetchImage(const std::string& image_id, | 167 DistilledPageProto* DistillerImpl::GetLastPage() const { |
168 DCHECK_GT(article_proto_->pages_size(), 0); | |
169 int index = article_proto_->pages_size() - 1; | |
170 DCHECK_GE(index, 0); | |
171 return article_proto_->mutable_pages(index); | |
172 } | |
173 | |
174 void DistillerImpl::FetchImage(DistilledPageProto* distilled_page_proto, | |
175 const std::string& image_id, | |
113 const std::string& item) { | 176 const std::string& item) { |
114 DistillerURLFetcher* fetcher = | 177 DistillerURLFetcher* fetcher = |
115 distiller_url_fetcher_factory_.CreateDistillerURLFetcher(); | 178 distiller_url_fetcher_factory_.CreateDistillerURLFetcher(); |
116 image_fetchers_[image_id] = fetcher; | 179 image_fetchers_[image_id] = fetcher; |
117 fetcher->FetchURL(item, | 180 fetcher->FetchURL(item, |
118 base::Bind(&DistillerImpl::OnFetchImageDone, | 181 base::Bind(&DistillerImpl::OnFetchImageDone, |
119 base::Unretained(this), image_id)); | 182 base::Unretained(this), |
183 base::Unretained(distilled_page_proto), | |
184 image_id)); | |
120 } | 185 } |
121 | 186 |
122 void DistillerImpl::OnFetchImageDone(const std::string& id, | 187 void DistillerImpl::OnFetchImageDone(DistilledPageProto* distilled_page_proto, |
188 const std::string& id, | |
123 const std::string& response) { | 189 const std::string& response) { |
124 DCHECK(proto_); | 190 DCHECK_GT(article_proto_->pages_size(), 0); |
125 DistilledPageProto_Image* image = proto_->add_image(); | 191 DCHECK(distilled_page_proto); |
192 DistilledPageProto_Image* image = distilled_page_proto->add_image(); | |
126 image->set_name(id); | 193 image->set_name(id); |
127 image->set_data(response); | 194 image->set_data(response); |
128 DCHECK(image_fetchers_.end() != image_fetchers_.find(id)); | 195 DCHECK(image_fetchers_.end() != image_fetchers_.find(id)); |
129 DistillerURLFetcher* fetcher = image_fetchers_[id]; | 196 DistillerURLFetcher* fetcher = image_fetchers_[id]; |
130 int result = image_fetchers_.erase(id); | 197 int result = image_fetchers_.erase(id); |
131 delete fetcher; | 198 delete fetcher; |
132 DCHECK_EQ(1, result); | 199 DCHECK_EQ(1, result); |
133 if (image_fetchers_.empty()) { | 200 DistillationTaskComplete(); |
Yaron
2014/01/29 20:03:41
Naming feels wrong because of this case. I had to
shashi
2014/01/29 22:51:37
Changed, hopefully better.
On 2014/01/29 20:03:41,
| |
134 distillation_cb_.Run(proto_.Pass()); | 201 } |
202 | |
203 void DistillerImpl::DistillationTaskComplete() { | |
204 if (image_fetchers_.empty() && !distillation_in_progress_) { | |
205 distillation_cb_.Run(article_proto_.Pass()); | |
135 } | 206 } |
136 } | 207 } |
137 | 208 |
138 } // namespace dom_distiller | 209 } // namespace dom_distiller |
OLD | NEW |