001    package ezvcard.io.html;
002    
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
013    
014    import org.jsoup.Jsoup;
015    import org.jsoup.nodes.Document;
016    import org.jsoup.nodes.Element;
017    import org.jsoup.select.Elements;
018    
019    import ezvcard.Messages;
020    import ezvcard.VCard;
021    import ezvcard.VCardVersion;
022    import ezvcard.io.CannotParseException;
023    import ezvcard.io.EmbeddedVCardException;
024    import ezvcard.io.SkipMeException;
025    import ezvcard.io.scribe.RawPropertyScribe;
026    import ezvcard.io.scribe.ScribeIndex;
027    import ezvcard.io.scribe.VCardPropertyScribe;
028    import ezvcard.io.scribe.VCardPropertyScribe.Result;
029    import ezvcard.property.Address;
030    import ezvcard.property.Categories;
031    import ezvcard.property.Email;
032    import ezvcard.property.Impp;
033    import ezvcard.property.Label;
034    import ezvcard.property.Nickname;
035    import ezvcard.property.RawProperty;
036    import ezvcard.property.Source;
037    import ezvcard.property.Telephone;
038    import ezvcard.property.Url;
039    import ezvcard.property.VCardProperty;
040    import ezvcard.util.HtmlUtils;
041    
042    /*
043     Copyright (c) 2013, Michael Angstadt
044     All rights reserved.
045    
046     Redistribution and use in source and binary forms, with or without
047     modification, are permitted provided that the following conditions are met: 
048    
049     1. Redistributions of source code must retain the above copyright notice, this
050     list of conditions and the following disclaimer. 
051     2. Redistributions in binary form must reproduce the above copyright notice,
052     this list of conditions and the following disclaimer in the documentation
053     and/or other materials provided with the distribution. 
054    
055     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
056     ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
057     WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
058     DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
059     ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
060     (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
061     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
062     ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
063     (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
064     SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
065    
066     The views and conclusions contained in the software and documentation are those
067     of the authors and should not be interpreted as representing official policies, 
068     either expressed or implied, of the FreeBSD Project.
069     */
070    
071    /**
072     * <p>
073     * Parses {@link VCard} objects from an HTML page (hCard format).
074     * </p>
075     * <p>
076     * <b>Example:</b>
077     * 
078     * <pre class="brush:java">
079     * URL url = new URL("http://example.com");
080     * HCardReader hcardReader = new HCardReader(url);
081     * VCard vcard;
082     * while ((vcard = hcardReader.readNext()) != null){
083     *   ...
084     * }
085     * </pre>
086     * 
087     * </p>
088     * @author Michael Angstadt
089     * @see <a
090     * href="http://microformats.org/wiki/hcard">http://microformats.org/wiki/hcard</a>
091     */
092    public class HCardReader {
093            private ScribeIndex index = new ScribeIndex();
094            private String pageUrl;
095            private final List<String> warnings = new ArrayList<String>();
096            private Elements vcardElements;
097            private Iterator<Element> it;
098            private final List<Label> labels = new ArrayList<Label>();
099            private VCard curVCard;
100            private Elements embeddedVCards = new Elements();
101            private Nickname nickname;
102            private Categories categories;
103    
104            private final String urlPropertyName = index.getPropertyScribe(Url.class).getPropertyName().toLowerCase();
105            private final String categoriesName = index.getPropertyScribe(Categories.class).getPropertyName().toLowerCase();
106            private final String emailName = index.getPropertyScribe(Email.class).getPropertyName().toLowerCase();
107            private final String telName = index.getPropertyScribe(Telephone.class).getPropertyName().toLowerCase();
108    
109            /**
110             * Creates a reader that parses hCards from a URL.
111             * @param url the URL of the webpage
112             * @throws IOException if there's a problem loading the webpage
113             */
114            public HCardReader(URL url) throws IOException {
115                    pageUrl = url.toString();
116                    Document document = Jsoup.parse(url, 30000);
117                    init(document, url.getRef());
118            }
119    
120            /**
121             * Creates a reader that parses hCards from an input stream.
122             * @param in the input stream to the HTML page
123             * @throws IOException if there's a problem reading the HTML page
124             */
125            public HCardReader(InputStream in) throws IOException {
126                    this(in, null);
127            }
128    
129            /**
130             * Creates a reader that parses hCards from an input stream.
131             * @param in the input stream to the HTML page
132             * @param pageUrl the original URL of the HTML page
133             * @throws IOException if there's a problem reading the HTML page
134             */
135            public HCardReader(InputStream in, String pageUrl) throws IOException {
136                    this.pageUrl = pageUrl;
137                    Document document = (pageUrl == null) ? Jsoup.parse(in, null, "") : Jsoup.parse(in, null, pageUrl);
138                    String anchor = getAnchor(pageUrl);
139                    init(document, anchor);
140            }
141    
142            /**
143             * Creates a reader that parses hCards from a file.
144             * @param file the HTML file
145             * @throws IOException if there's a problem reading the HTML file
146             */
147            public HCardReader(File file) throws IOException {
148                    this(file, null);
149            }
150    
151            /**
152             * Creates a reader that parses hCards from a file.
153             * @param file the HTML file
154             * @param pageUrl the original URL of the HTML page
155             * @throws IOException if there's a problem reading the HTML file
156             */
157            public HCardReader(File file, String pageUrl) throws IOException {
158                    this.pageUrl = pageUrl;
159                    Document document = (pageUrl == null) ? Jsoup.parse(file, null, "") : Jsoup.parse(file, null, pageUrl);
160                    String anchor = getAnchor(pageUrl);
161                    init(document, anchor);
162            }
163    
164            /**
165             * Creates a reader that parses hCards from a reader.
166             * @param reader the input stream to the HTML page
167             * @throws IOException if there's a problem reading the HTML page
168             */
169            public HCardReader(Reader reader) throws IOException {
170                    this(reader, null);
171            }
172    
173            /**
174             * Creates a reader that parses hCards from a reader.
175             * @param reader the input stream to the HTML page
176             * @param pageUrl the original URL of the HTML page
177             * @throws IOException if there's a problem reading the HTML page
178             */
179            public HCardReader(Reader reader, String pageUrl) throws IOException {
180                    this.pageUrl = pageUrl;
181    
182                    StringBuilder sb = new StringBuilder();
183                    char buffer[] = new char[4096];
184                    int read;
185                    while ((read = reader.read(buffer)) != -1) {
186                            sb.append(buffer, 0, read);
187                    }
188                    String html = sb.toString();
189    
190                    Document document = (pageUrl == null) ? Jsoup.parse(html) : Jsoup.parse(html, pageUrl);
191                    String anchor = getAnchor(pageUrl);
192                    init(document, anchor);
193            }
194    
195            /**
196             * Creates a reader that parses hCards from a string.
197             * @param html the HTML page
198             */
199            public HCardReader(String html) {
200                    this(html, null);
201            }
202    
203            /**
204             * Creates a reader that parses hCards from a string.
205             * @param html the HTML page
206             * @param pageUrl the original URL of the HTML page
207             */
208            public HCardReader(String html, String pageUrl) {
209                    this.pageUrl = pageUrl;
210    
211                    Document document = (pageUrl == null) ? Jsoup.parse(html) : Jsoup.parse(html, pageUrl);
212                    String anchor = getAnchor(pageUrl);
213                    init(document, anchor);
214            }
215    
216            /**
217             * Constructor for reading embedded vCards.
218             * @param embeddedVCard the HTML element of the embedded vCard
219             * @param pageUrl the original URL of the HTML page
220             */
221            private HCardReader(Element embeddedVCard, String pageUrl) {
222                    this.pageUrl = pageUrl;
223                    vcardElements = new Elements(embeddedVCard);
224                    it = vcardElements.iterator();
225            }
226    
227            private void init(Document document, String anchor) {
228                    Element searchIn = null;
229                    if (anchor != null) {
230                            searchIn = document.getElementById(anchor);
231                    }
232                    if (searchIn == null) {
233                            searchIn = document;
234                    }
235    
236                    vcardElements = searchIn.getElementsByClass("vcard");
237                    it = vcardElements.iterator();
238            }
239    
240            /**
241             * Gets the anchor part of a URL.
242             * @param urlStr the URL
243             * @return the anchor (e.g. "foo" from the URL
244             * "http://example.com/index.php#foo")
245             */
246            private String getAnchor(String urlStr) {
247                    if (urlStr == null) {
248                            return null;
249                    }
250    
251                    try {
252                            URL url = new URL(urlStr);
253                            return url.getRef();
254                    } catch (MalformedURLException e) {
255                            return null;
256                    }
257            }
258    
259            /**
260             * <p>
261             * Registers a property scribe. This is the same as calling:
262             * </p>
263             * <p>
264             * {@code getScribeIndex().register(scribe)}
265             * </p>
266             * @param scribe the scribe to register
267             */
268            public void registerScribe(VCardPropertyScribe<? extends VCardProperty> scribe) {
269                    index.register(scribe);
270            }
271    
272            /**
273             * Gets the scribe index.
274             * @return the scribe index
275             */
276            public ScribeIndex getScribeIndex() {
277                    return index;
278            }
279    
280            /**
281             * Sets the scribe index.
282             * @param index the scribe index
283             */
284            public void setScribeIndex(ScribeIndex index) {
285                    this.index = index;
286            }
287    
288            /**
289             * Gets the warnings from the last vCard that was unmarshalled. This list is
290             * reset every time a new vCard is read.
291             * @return the warnings or empty list if there were no warnings
292             */
293            public List<String> getWarnings() {
294                    return new ArrayList<String>(warnings);
295            }
296    
297            /**
298             * Reads the next vCard from the data stream.
299             * @return the next vCard or null if there are no more
300             */
301            public VCard readNext() {
302                    Element vcardElement = null;
303                    while (it.hasNext() && vcardElement == null) {
304                            vcardElement = it.next();
305    
306                            //if this element is a child of another "vcard" element, then ignore it because it's an embedded vcard
307                            if (HtmlUtils.isChildOf(vcardElement, vcardElements)) {
308                                    vcardElement = null;
309                            }
310                    }
311                    if (vcardElement == null) {
312                            return null;
313                    }
314    
315                    warnings.clear();
316                    labels.clear();
317                    nickname = null;
318                    categories = null;
319    
320                    curVCard = new VCard();
321                    curVCard.setVersion(VCardVersion.V3_0);
322                    if (pageUrl != null) {
323                            curVCard.addSource(new Source(pageUrl));
324                    }
325    
326                    //visit all descendant nodes, depth-first
327                    for (Element child : vcardElement.children()) {
328                            visit(child);
329                    }
330    
331                    //assign labels to their addresses
332                    for (Label label : labels) {
333                            boolean orphaned = true;
334                            for (Address adr : curVCard.getAddresses()) {
335                                    if (adr.getLabel() == null && adr.getTypes().equals(label.getTypes())) {
336                                            adr.setLabel(label.getValue());
337                                            orphaned = false;
338                                            break;
339                                    }
340                            }
341                            if (orphaned) {
342                                    curVCard.addOrphanedLabel(label);
343                            }
344                    }
345    
346                    return curVCard;
347            }
348    
349            private void visit(Element element) {
350                    Set<String> classNames = element.classNames();
351                    for (String className : classNames) {
352                            className = className.toLowerCase();
353    
354                            if (urlPropertyName.equalsIgnoreCase(className)) {
355                                    String href = element.attr("href");
356                                    if (href.length() > 0) {
357                                            if (!classNames.contains(emailName) && href.matches("(?i)mailto:.*")) {
358                                                    className = emailName;
359                                            } else if (!classNames.contains(telName) && href.matches("(?i)tel:.*")) {
360                                                    className = telName;
361                                            } else {
362                                                    //try parsing as IMPP
363                                                    VCardPropertyScribe<? extends VCardProperty> scribe = index.getPropertyScribe(Impp.class);
364                                                    try {
365                                                            Result<? extends VCardProperty> result = scribe.parseHtml(element);
366                                                            curVCard.addProperty(result.getProperty());
367                                                            for (String warning : result.getWarnings()) {
368                                                                    addWarning(scribe.getPropertyName(), warning);
369                                                            }
370                                                            continue;
371                                                    } catch (SkipMeException e) {
372                                                            //URL is not an instant messenger URL
373                                                    } catch (CannotParseException e) {
374                                                            //URL is not an instant messenger URL
375                                                    }
376                                            }
377                                    }
378                            }
379    
380                            //hCard uses a different name for the CATEGORIES property
381                            if ("category".equalsIgnoreCase(className)) {
382                                    className = categoriesName;
383                            }
384    
385                            VCardPropertyScribe<? extends VCardProperty> scribe = index.getPropertyScribe(className);
386                            if (scribe == null) {
387                                    //if no scribe is found, and the class name doesn't start with "x-", then it must be an arbitrary CSS class that has nothing to do with vCard
388                                    if (!className.startsWith("x-")) {
389                                            continue;
390                                    }
391                                    scribe = new RawPropertyScribe(className);
392                            }
393    
394                            VCardProperty property;
395                            try {
396                                    Result<? extends VCardProperty> result = scribe.parseHtml(element);
397    
398                                    for (String warning : result.getWarnings()) {
399                                            addWarning(className, warning);
400                                    }
401    
402                                    property = result.getProperty();
403    
404                                    //LABELs must be treated specially so they can be matched up with their ADRs
405                                    if (property instanceof Label) {
406                                            labels.add((Label) property);
407                                            continue;
408                                    }
409    
410                                    //add all NICKNAMEs to the same type object
411                                    if (property instanceof Nickname) {
412                                            Nickname nn = (Nickname) property;
413                                            if (nickname == null) {
414                                                    nickname = nn;
415                                                    curVCard.addProperty(nickname);
416                                            } else {
417                                                    nickname.getValues().addAll(nn.getValues());
418                                            }
419                                            continue;
420                                    }
421    
422                                    //add all CATEGORIES to the same type object
423                                    if (property instanceof Categories) {
424                                            Categories c = (Categories) property;
425                                            if (categories == null) {
426                                                    categories = c;
427                                                    curVCard.addProperty(categories);
428                                            } else {
429                                                    categories.getValues().addAll(c.getValues());
430                                            }
431                                            continue;
432                                    }
433                            } catch (SkipMeException e) {
434                                    addWarning(className, 22, e.getMessage());
435                                    continue;
436                            } catch (CannotParseException e) {
437                                    String html = element.outerHtml();
438                                    addWarning(className, 32, html, e.getMessage());
439                                    property = new RawProperty(className, html);
440                            } catch (EmbeddedVCardException e) {
441                                    if (HtmlUtils.isChildOf(element, embeddedVCards)) {
442                                            //prevents multiple-nested embedded elements from overwriting each other
443                                            continue;
444                                    }
445    
446                                    property = e.getProperty();
447    
448                                    embeddedVCards.add(element);
449                                    HCardReader embeddedReader = new HCardReader(element, pageUrl);
450                                    try {
451                                            VCard embeddedVCard = embeddedReader.readNext();
452                                            e.injectVCard(embeddedVCard);
453                                    } finally {
454                                            for (String w : embeddedReader.getWarnings()) {
455                                                    addWarning(className, 26, w);
456                                            }
457                                    }
458                            }
459    
460                            curVCard.addProperty(property);
461                    }
462    
463                    for (Element child : element.children()) {
464                            visit(child);
465                    }
466            }
467    
468            private void addWarning(String propertyName, int code, Object... args) {
469                    String message = Messages.INSTANCE.getParseMessage(code, args);
470                    addWarning(propertyName, message);
471            }
472    
473            private void addWarning(String propertyName, String message) {
474                    String warning = Messages.INSTANCE.getParseMessage(35, propertyName, message);
475                    warnings.add(warning);
476            }
477    }