001 package ezvcard.io.html;
002
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import ezvcard.Messages;
import ezvcard.VCard;
import ezvcard.VCardVersion;
import ezvcard.io.CannotParseException;
import ezvcard.io.EmbeddedVCardException;
import ezvcard.io.SkipMeException;
import ezvcard.io.scribe.RawPropertyScribe;
import ezvcard.io.scribe.ScribeIndex;
import ezvcard.io.scribe.VCardPropertyScribe;
import ezvcard.io.scribe.VCardPropertyScribe.Result;
import ezvcard.property.Address;
import ezvcard.property.Categories;
import ezvcard.property.Email;
import ezvcard.property.Impp;
import ezvcard.property.Label;
import ezvcard.property.Nickname;
import ezvcard.property.RawProperty;
import ezvcard.property.Source;
import ezvcard.property.Telephone;
import ezvcard.property.Url;
import ezvcard.property.VCardProperty;
import ezvcard.util.HtmlUtils;
041
042 /*
043 Copyright (c) 2013, Michael Angstadt
044 All rights reserved.
045
046 Redistribution and use in source and binary forms, with or without
047 modification, are permitted provided that the following conditions are met:
048
049 1. Redistributions of source code must retain the above copyright notice, this
050 list of conditions and the following disclaimer.
051 2. Redistributions in binary form must reproduce the above copyright notice,
052 this list of conditions and the following disclaimer in the documentation
053 and/or other materials provided with the distribution.
054
055 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
056 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
057 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
058 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
059 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
060 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
061 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
062 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
063 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
064 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
065
066 The views and conclusions contained in the software and documentation are those
067 of the authors and should not be interpreted as representing official policies,
068 either expressed or implied, of the FreeBSD Project.
069 */
070
071 /**
072 * <p>
073 * Parses {@link VCard} objects from an HTML page (hCard format).
074 * </p>
075 * <p>
076 * <b>Example:</b>
077 *
078 * <pre class="brush:java">
079 * URL url = new URL("http://example.com");
080 * HCardReader hcardReader = new HCardReader(url);
081 * VCard vcard;
082 * while ((vcard = hcardReader.readNext()) != null){
083 * ...
084 * }
085 * </pre>
086 *
087 * </p>
088 * @author Michael Angstadt
089 * @see <a
090 * href="http://microformats.org/wiki/hcard">http://microformats.org/wiki/hcard</a>
091 */
092 public class HCardReader {
093 private ScribeIndex index = new ScribeIndex();
094 private String pageUrl;
095 private final List<String> warnings = new ArrayList<String>();
096 private Elements vcardElements;
097 private Iterator<Element> it;
098 private final List<Label> labels = new ArrayList<Label>();
099 private VCard curVCard;
100 private Elements embeddedVCards = new Elements();
101 private Nickname nickname;
102 private Categories categories;
103
104 private final String urlPropertyName = index.getPropertyScribe(Url.class).getPropertyName().toLowerCase();
105 private final String categoriesName = index.getPropertyScribe(Categories.class).getPropertyName().toLowerCase();
106 private final String emailName = index.getPropertyScribe(Email.class).getPropertyName().toLowerCase();
107 private final String telName = index.getPropertyScribe(Telephone.class).getPropertyName().toLowerCase();
108
109 /**
110 * Creates a reader that parses hCards from a URL.
111 * @param url the URL of the webpage
112 * @throws IOException if there's a problem loading the webpage
113 */
114 public HCardReader(URL url) throws IOException {
115 pageUrl = url.toString();
116 Document document = Jsoup.parse(url, 30000);
117 init(document, url.getRef());
118 }
119
120 /**
121 * Creates a reader that parses hCards from an input stream.
122 * @param in the input stream to the HTML page
123 * @throws IOException if there's a problem reading the HTML page
124 */
125 public HCardReader(InputStream in) throws IOException {
126 this(in, null);
127 }
128
129 /**
130 * Creates a reader that parses hCards from an input stream.
131 * @param in the input stream to the HTML page
132 * @param pageUrl the original URL of the HTML page
133 * @throws IOException if there's a problem reading the HTML page
134 */
135 public HCardReader(InputStream in, String pageUrl) throws IOException {
136 this.pageUrl = pageUrl;
137 Document document = (pageUrl == null) ? Jsoup.parse(in, null, "") : Jsoup.parse(in, null, pageUrl);
138 String anchor = getAnchor(pageUrl);
139 init(document, anchor);
140 }
141
142 /**
143 * Creates a reader that parses hCards from a file.
144 * @param file the HTML file
145 * @throws IOException if there's a problem reading the HTML file
146 */
147 public HCardReader(File file) throws IOException {
148 this(file, null);
149 }
150
151 /**
152 * Creates a reader that parses hCards from a file.
153 * @param file the HTML file
154 * @param pageUrl the original URL of the HTML page
155 * @throws IOException if there's a problem reading the HTML file
156 */
157 public HCardReader(File file, String pageUrl) throws IOException {
158 this.pageUrl = pageUrl;
159 Document document = (pageUrl == null) ? Jsoup.parse(file, null, "") : Jsoup.parse(file, null, pageUrl);
160 String anchor = getAnchor(pageUrl);
161 init(document, anchor);
162 }
163
164 /**
165 * Creates a reader that parses hCards from a reader.
166 * @param reader the input stream to the HTML page
167 * @throws IOException if there's a problem reading the HTML page
168 */
169 public HCardReader(Reader reader) throws IOException {
170 this(reader, null);
171 }
172
173 /**
174 * Creates a reader that parses hCards from a reader.
175 * @param reader the input stream to the HTML page
176 * @param pageUrl the original URL of the HTML page
177 * @throws IOException if there's a problem reading the HTML page
178 */
179 public HCardReader(Reader reader, String pageUrl) throws IOException {
180 this.pageUrl = pageUrl;
181
182 StringBuilder sb = new StringBuilder();
183 char buffer[] = new char[4096];
184 int read;
185 while ((read = reader.read(buffer)) != -1) {
186 sb.append(buffer, 0, read);
187 }
188 String html = sb.toString();
189
190 Document document = (pageUrl == null) ? Jsoup.parse(html) : Jsoup.parse(html, pageUrl);
191 String anchor = getAnchor(pageUrl);
192 init(document, anchor);
193 }
194
195 /**
196 * Creates a reader that parses hCards from a string.
197 * @param html the HTML page
198 */
199 public HCardReader(String html) {
200 this(html, null);
201 }
202
203 /**
204 * Creates a reader that parses hCards from a string.
205 * @param html the HTML page
206 * @param pageUrl the original URL of the HTML page
207 */
208 public HCardReader(String html, String pageUrl) {
209 this.pageUrl = pageUrl;
210
211 Document document = (pageUrl == null) ? Jsoup.parse(html) : Jsoup.parse(html, pageUrl);
212 String anchor = getAnchor(pageUrl);
213 init(document, anchor);
214 }
215
216 /**
217 * Constructor for reading embedded vCards.
218 * @param embeddedVCard the HTML element of the embedded vCard
219 * @param pageUrl the original URL of the HTML page
220 */
221 private HCardReader(Element embeddedVCard, String pageUrl) {
222 this.pageUrl = pageUrl;
223 vcardElements = new Elements(embeddedVCard);
224 it = vcardElements.iterator();
225 }
226
227 private void init(Document document, String anchor) {
228 Element searchIn = null;
229 if (anchor != null) {
230 searchIn = document.getElementById(anchor);
231 }
232 if (searchIn == null) {
233 searchIn = document;
234 }
235
236 vcardElements = searchIn.getElementsByClass("vcard");
237 it = vcardElements.iterator();
238 }
239
240 /**
241 * Gets the anchor part of a URL.
242 * @param urlStr the URL
243 * @return the anchor (e.g. "foo" from the URL
244 * "http://example.com/index.php#foo")
245 */
246 private String getAnchor(String urlStr) {
247 if (urlStr == null) {
248 return null;
249 }
250
251 try {
252 URL url = new URL(urlStr);
253 return url.getRef();
254 } catch (MalformedURLException e) {
255 return null;
256 }
257 }
258
259 /**
260 * <p>
261 * Registers a property scribe. This is the same as calling:
262 * </p>
263 * <p>
264 * {@code getScribeIndex().register(scribe)}
265 * </p>
266 * @param scribe the scribe to register
267 */
268 public void registerScribe(VCardPropertyScribe<? extends VCardProperty> scribe) {
269 index.register(scribe);
270 }
271
272 /**
273 * Gets the scribe index.
274 * @return the scribe index
275 */
276 public ScribeIndex getScribeIndex() {
277 return index;
278 }
279
280 /**
281 * Sets the scribe index.
282 * @param index the scribe index
283 */
284 public void setScribeIndex(ScribeIndex index) {
285 this.index = index;
286 }
287
288 /**
289 * Gets the warnings from the last vCard that was unmarshalled. This list is
290 * reset every time a new vCard is read.
291 * @return the warnings or empty list if there were no warnings
292 */
293 public List<String> getWarnings() {
294 return new ArrayList<String>(warnings);
295 }
296
297 /**
298 * Reads the next vCard from the data stream.
299 * @return the next vCard or null if there are no more
300 */
301 public VCard readNext() {
302 Element vcardElement = null;
303 while (it.hasNext() && vcardElement == null) {
304 vcardElement = it.next();
305
306 //if this element is a child of another "vcard" element, then ignore it because it's an embedded vcard
307 if (HtmlUtils.isChildOf(vcardElement, vcardElements)) {
308 vcardElement = null;
309 }
310 }
311 if (vcardElement == null) {
312 return null;
313 }
314
315 warnings.clear();
316 labels.clear();
317 nickname = null;
318 categories = null;
319
320 curVCard = new VCard();
321 curVCard.setVersion(VCardVersion.V3_0);
322 if (pageUrl != null) {
323 curVCard.addSource(new Source(pageUrl));
324 }
325
326 //visit all descendant nodes, depth-first
327 for (Element child : vcardElement.children()) {
328 visit(child);
329 }
330
331 //assign labels to their addresses
332 for (Label label : labels) {
333 boolean orphaned = true;
334 for (Address adr : curVCard.getAddresses()) {
335 if (adr.getLabel() == null && adr.getTypes().equals(label.getTypes())) {
336 adr.setLabel(label.getValue());
337 orphaned = false;
338 break;
339 }
340 }
341 if (orphaned) {
342 curVCard.addOrphanedLabel(label);
343 }
344 }
345
346 return curVCard;
347 }
348
349 private void visit(Element element) {
350 Set<String> classNames = element.classNames();
351 for (String className : classNames) {
352 className = className.toLowerCase();
353
354 if (urlPropertyName.equalsIgnoreCase(className)) {
355 String href = element.attr("href");
356 if (href.length() > 0) {
357 if (!classNames.contains(emailName) && href.matches("(?i)mailto:.*")) {
358 className = emailName;
359 } else if (!classNames.contains(telName) && href.matches("(?i)tel:.*")) {
360 className = telName;
361 } else {
362 //try parsing as IMPP
363 VCardPropertyScribe<? extends VCardProperty> scribe = index.getPropertyScribe(Impp.class);
364 try {
365 Result<? extends VCardProperty> result = scribe.parseHtml(element);
366 curVCard.addProperty(result.getProperty());
367 for (String warning : result.getWarnings()) {
368 addWarning(scribe.getPropertyName(), warning);
369 }
370 continue;
371 } catch (SkipMeException e) {
372 //URL is not an instant messenger URL
373 } catch (CannotParseException e) {
374 //URL is not an instant messenger URL
375 }
376 }
377 }
378 }
379
380 //hCard uses a different name for the CATEGORIES property
381 if ("category".equalsIgnoreCase(className)) {
382 className = categoriesName;
383 }
384
385 VCardPropertyScribe<? extends VCardProperty> scribe = index.getPropertyScribe(className);
386 if (scribe == null) {
387 //if no scribe is found, and the class name doesn't start with "x-", then it must be an arbitrary CSS class that has nothing to do with vCard
388 if (!className.startsWith("x-")) {
389 continue;
390 }
391 scribe = new RawPropertyScribe(className);
392 }
393
394 VCardProperty property;
395 try {
396 Result<? extends VCardProperty> result = scribe.parseHtml(element);
397
398 for (String warning : result.getWarnings()) {
399 addWarning(className, warning);
400 }
401
402 property = result.getProperty();
403
404 //LABELs must be treated specially so they can be matched up with their ADRs
405 if (property instanceof Label) {
406 labels.add((Label) property);
407 continue;
408 }
409
410 //add all NICKNAMEs to the same type object
411 if (property instanceof Nickname) {
412 Nickname nn = (Nickname) property;
413 if (nickname == null) {
414 nickname = nn;
415 curVCard.addProperty(nickname);
416 } else {
417 nickname.getValues().addAll(nn.getValues());
418 }
419 continue;
420 }
421
422 //add all CATEGORIES to the same type object
423 if (property instanceof Categories) {
424 Categories c = (Categories) property;
425 if (categories == null) {
426 categories = c;
427 curVCard.addProperty(categories);
428 } else {
429 categories.getValues().addAll(c.getValues());
430 }
431 continue;
432 }
433 } catch (SkipMeException e) {
434 addWarning(className, 22, e.getMessage());
435 continue;
436 } catch (CannotParseException e) {
437 String html = element.outerHtml();
438 addWarning(className, 32, html, e.getMessage());
439 property = new RawProperty(className, html);
440 } catch (EmbeddedVCardException e) {
441 if (HtmlUtils.isChildOf(element, embeddedVCards)) {
442 //prevents multiple-nested embedded elements from overwriting each other
443 continue;
444 }
445
446 property = e.getProperty();
447
448 embeddedVCards.add(element);
449 HCardReader embeddedReader = new HCardReader(element, pageUrl);
450 try {
451 VCard embeddedVCard = embeddedReader.readNext();
452 e.injectVCard(embeddedVCard);
453 } finally {
454 for (String w : embeddedReader.getWarnings()) {
455 addWarning(className, 26, w);
456 }
457 }
458 }
459
460 curVCard.addProperty(property);
461 }
462
463 for (Element child : element.children()) {
464 visit(child);
465 }
466 }
467
468 private void addWarning(String propertyName, int code, Object... args) {
469 String message = Messages.INSTANCE.getParseMessage(code, args);
470 addWarning(propertyName, message);
471 }
472
473 private void addWarning(String propertyName, String message) {
474 String warning = Messages.INSTANCE.getParseMessage(35, propertyName, message);
475 warnings.add(warning);
476 }
477 }