001    package ezvcard.io;
002    
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.lang.reflect.Method;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
016    
017    import org.jsoup.Jsoup;
018    import org.jsoup.nodes.Document;
019    import org.jsoup.nodes.Element;
020    import org.jsoup.select.Elements;
021    
022    import ezvcard.VCard;
023    import ezvcard.VCardVersion;
024    import ezvcard.types.AddressType;
025    import ezvcard.types.CategoriesType;
026    import ezvcard.types.EmailType;
027    import ezvcard.types.ImppType;
028    import ezvcard.types.LabelType;
029    import ezvcard.types.NicknameType;
030    import ezvcard.types.RawType;
031    import ezvcard.types.SourceType;
032    import ezvcard.types.TelephoneType;
033    import ezvcard.types.TypeList;
034    import ezvcard.types.UrlType;
035    import ezvcard.types.VCardType;
036    import ezvcard.util.HtmlUtils;
037    
038    /*
039     Copyright (c) 2012, Michael Angstadt
040     All rights reserved.
041    
042     Redistribution and use in source and binary forms, with or without
043     modification, are permitted provided that the following conditions are met: 
044    
045     1. Redistributions of source code must retain the above copyright notice, this
046     list of conditions and the following disclaimer. 
047     2. Redistributions in binary form must reproduce the above copyright notice,
048     this list of conditions and the following disclaimer in the documentation
049     and/or other materials provided with the distribution. 
050    
051     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
052     ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
053     WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
054     DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
055     ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
056     (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
057     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
058     ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
059     (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
060     SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
061    
062     The views and conclusions contained in the software and documentation are those
063     of the authors and should not be interpreted as representing official policies, 
064     either expressed or implied, of the FreeBSD Project.
065     */
066    
067    /**
068     * Reads vCards encoded in HTML (hCard format).
069     * @author Michael Angstadt
070     * @see <a
071     * href="http://microformats.org/wiki/hcard">http://microformats.org/wiki/hcard</a>
072     */
073    public class HCardReader implements IParser {
074            protected String pageUrl;
075            protected List<String> warnings = new ArrayList<String>();
076            protected Map<String, Class<? extends VCardType>> extendedTypeClasses = new HashMap<String, Class<? extends VCardType>>();
077            protected Elements vcardElements;
078            protected Iterator<Element> it;
079            protected List<LabelType> labels = new ArrayList<LabelType>();
080            protected List<String> warningsBuffer = new ArrayList<String>();
081            protected VCard curVCard;
082            protected Elements embeddedVCards = new Elements();
083            protected NicknameType nickname;
084            protected CategoriesType categories;
085    
086            /**
087             * @param url the URL of the webpage
088             * @throws IOException if there's a problem loading the webpage
089             */
090            public HCardReader(URL url) throws IOException {
091                    pageUrl = url.toString();
092                    Document document = Jsoup.parse(url, 30000);
093                    init(document, url.getRef());
094            }
095    
096            /**
097             * @param in the input stream to the HTML page
098             * @throws IOException if there's a problem reading the HTML page
099             */
100            public HCardReader(InputStream in) throws IOException {
101                    this(in, null);
102            }
103    
104            /**
105             * @param in the input stream to the HTML page
106             * @param pageUrl the original URL of the HTML page
107             * @throws IOException if there's a problem reading the HTML page
108             */
109            public HCardReader(InputStream in, String pageUrl) throws IOException {
110                    this.pageUrl = pageUrl;
111                    Document document = (pageUrl == null) ? Jsoup.parse(in, null, "") : Jsoup.parse(in, null, pageUrl);
112                    String anchor = getAnchor(pageUrl);
113                    init(document, anchor);
114            }
115    
116            /**
117             * @param file the HTML file
118             * @throws IOException if there's a problem reading the HTML file
119             */
120            public HCardReader(File file) throws IOException {
121                    this(file, null);
122            }
123    
124            /**
125             * @param file the HTML file
126             * @param pageUrl the original URL of the HTML page
127             * @throws IOException if there's a problem reading the HTML file
128             */
129            public HCardReader(File file, String pageUrl) throws IOException {
130                    this.pageUrl = pageUrl;
131                    Document document = (pageUrl == null) ? Jsoup.parse(file, null, "") : Jsoup.parse(file, null, pageUrl);
132                    String anchor = getAnchor(pageUrl);
133                    init(document, anchor);
134            }
135    
136            /**
137             * @param reader the input stream to the HTML page
138             * @throws IOException if there's a problem reading the HTML page
139             */
140            public HCardReader(Reader reader) throws IOException {
141                    this(reader, null);
142            }
143    
144            /**
145             * @param reader the input stream to the HTML page
146             * @param pageUrl the original URL of the HTML page
147             * @throws IOException if there's a problem reading the HTML page
148             */
149            public HCardReader(Reader reader, String pageUrl) throws IOException {
150                    this.pageUrl = pageUrl;
151    
152                    StringBuilder sb = new StringBuilder();
153                    char buffer[] = new char[4096];
154                    int read;
155                    while ((read = reader.read(buffer)) != -1) {
156                            sb.append(buffer, 0, read);
157                    }
158                    String html = sb.toString();
159    
160                    Document document = (pageUrl == null) ? Jsoup.parse(html) : Jsoup.parse(html, pageUrl);
161                    String anchor = getAnchor(pageUrl);
162                    init(document, anchor);
163            }
164    
165            /**
166             * @param html the HTML page
167             */
168            public HCardReader(String html) {
169                    this(html, null);
170            }
171    
172            /**
173             * @param html the HTML page
174             * @param pageUrl the original URL of the HTML page
175             */
176            public HCardReader(String html, String pageUrl) {
177                    this.pageUrl = pageUrl;
178    
179                    Document document = (pageUrl == null) ? Jsoup.parse(html) : Jsoup.parse(html, pageUrl);
180                    String anchor = getAnchor(pageUrl);
181                    init(document, anchor);
182            }
183    
184            /**
185             * Constructor for reading embedded vCards.
186             * @param embeddedVCard the HTML element of the embedded vCard
187             * @param pageUrl the original URL of the HTML page
188             */
189            private HCardReader(Element embeddedVCard, String pageUrl) {
190                    this.pageUrl = pageUrl;
191                    vcardElements = new Elements(embeddedVCard);
192                    it = vcardElements.iterator();
193            }
194    
195            private void init(Document document, String anchor) {
196                    Element searchIn = null;
197                    if (anchor != null) {
198                            searchIn = document.getElementById(anchor);
199                    }
200                    if (searchIn == null) {
201                            searchIn = document;
202                    }
203    
204                    vcardElements = searchIn.getElementsByClass("vcard");
205                    it = vcardElements.iterator();
206            }
207    
208            /**
209             * Gets the anchor part of a URL.
210             * @param urlStr the URL
211             * @return the anchor (e.g. "foo" from the URL
212             * "http://example.com/index.php#foo")
213             */
214            private String getAnchor(String urlStr) {
215                    if (urlStr == null) {
216                            return null;
217                    }
218    
219                    try {
220                            URL url = new URL(urlStr);
221                            return url.getRef();
222                    } catch (MalformedURLException e) {
223                            return null;
224                    }
225            }
226    
227            //@Override
228            public void registerExtendedType(Class<? extends VCardType> clazz) {
229                    extendedTypeClasses.put(getTypeNameFromTypeClass(clazz), clazz);
230            }
231    
232            //@Override
233            public void unregisterExtendedType(Class<? extends VCardType> clazz) {
234                    extendedTypeClasses.remove(getTypeNameFromTypeClass(clazz));
235            }
236    
237            //@Override
238            public List<String> getWarnings() {
239                    return new ArrayList<String>(warnings);
240            }
241    
242            //@Override
243            public VCard readNext() {
244                    Element vcardElement = null;
245                    while (it.hasNext() && vcardElement == null) {
246                            vcardElement = it.next();
247    
248                            //if this element is a child of another "vcard" element, then ignore it because it's an embedded vcard
249                            if (HtmlUtils.isChildOf(vcardElement, vcardElements)) {
250                                    vcardElement = null;
251                            }
252                    }
253                    if (vcardElement == null) {
254                            return null;
255                    }
256    
257                    warnings.clear();
258                    warningsBuffer.clear();
259                    labels.clear();
260                    nickname = null;
261                    categories = null;
262    
263                    curVCard = new VCard();
264                    curVCard.setVersion(VCardVersion.V3_0);
265                    if (pageUrl != null) {
266                            curVCard.addSource(new SourceType(pageUrl));
267                    }
268    
269                    //visit all descendant nodes, depth-first
270                    for (Element child : vcardElement.children()) {
271                            visit(child);
272                    }
273    
274                    //assign labels to their addresses
275                    for (LabelType label : labels) {
276                            boolean orphaned = true;
277                            for (AddressType adr : curVCard.getAddresses()) {
278                                    if (adr.getLabel() == null && adr.getTypes().equals(label.getTypes())) {
279                                            adr.setLabel(label.getValue());
280                                            orphaned = false;
281                                            break;
282                                    }
283                            }
284                            if (orphaned) {
285                                    curVCard.addOrphanedLabel(label);
286                            }
287                    }
288    
289                    return curVCard;
290            }
291    
292            private void visit(Element element) {
293                    Set<String> classNames = element.classNames();
294                    for (String className : classNames) {
295                            if (UrlType.NAME.equalsIgnoreCase(className)) {
296                                    String href = element.attr("href");
297                                    if (href.length() > 0) {
298                                            if (!classNames.contains(EmailType.NAME.toLowerCase()) && href.matches("(?i)mailto:.*")) {
299                                                    className = EmailType.NAME;
300                                            } else if (!classNames.contains(TelephoneType.NAME.toLowerCase()) && href.matches("(?i)tel:.*")) {
301                                                    className = TelephoneType.NAME;
302                                            } else {
303                                                    //try parsing as IMPP
304                                                    warningsBuffer.clear();
305                                                    ImppType impp = new ImppType();
306                                                    try {
307                                                            impp.unmarshalHtml(element, warningsBuffer);
308                                                            addToVCard(impp, curVCard);
309                                                            warnings.addAll(warningsBuffer);
310                                                            continue;
311                                                    } catch (SkipMeException e) {
312                                                            //URL is not an instant messenger URL
313                                                    }
314                                            }
315                                    }
316                            }
317    
318                            VCardType type = createTypeObject(className);
319                            if (type == null) {
320                                    //if no type class is found, then it must be an arbitrary CSS class that has nothing to do with vCard
321                                    continue;
322                            }
323    
324                            warningsBuffer.clear();
325                            try {
326                                    type.unmarshalHtml(element, warningsBuffer);
327    
328                                    //add to vcard
329                                    if (type instanceof LabelType) {
330                                            //LABELs must be treated specially so they can be matched up with their ADRs
331                                            labels.add((LabelType) type);
332                                    } else if (type instanceof NicknameType) {
333                                            //add all NICKNAMEs to the same type object
334                                            NicknameType nn = (NicknameType) type;
335                                            if (nickname == null) {
336                                                    nickname = nn;
337                                                    addToVCard(nickname, curVCard);
338                                            } else {
339                                                    nickname.getValues().addAll(nn.getValues());
340                                            }
341                                    } else if (type instanceof CategoriesType) {
342                                            //add all CATEGORIES to the same type object
343                                            CategoriesType c = (CategoriesType) type;
344                                            if (categories == null) {
345                                                    categories = c;
346                                                    addToVCard(categories, curVCard);
347                                            } else {
348                                                    categories.getValues().addAll(c.getValues());
349                                            }
350                                    } else {
351                                            addToVCard(type, curVCard);
352                                    }
353                            } catch (SkipMeException e) {
354                                    warningsBuffer.add(type.getTypeName() + " property will not be unmarshalled: " + e.getMessage());
355                            } catch (EmbeddedVCardException e) {
356                                    if (HtmlUtils.isChildOf(element, embeddedVCards)) {
357                                            //prevents multiple-nested embedded elements from overwriting each other
358                                            continue;
359                                    }
360    
361                                    embeddedVCards.add(element);
362                                    HCardReader embeddedReader = new HCardReader(element, pageUrl);
363                                    try {
364                                            VCard embeddedVCard = embeddedReader.readNext();
365                                            e.injectVCard(embeddedVCard);
366                                    } finally {
367                                            for (String w : embeddedReader.getWarnings()) {
368                                                    warnings.add("Problem unmarshalling nested vCard value from " + type.getTypeName() + ": " + w);
369                                            }
370                                    }
371                                    addToVCard(type, curVCard);
372                            } catch (UnsupportedOperationException e) {
373                                    //type class does not support hCard
374                                    warningsBuffer.add("Type class \"" + type.getClass().getName() + "\" does not support hCard unmarshalling.");
375                            } finally {
376                                    warnings.addAll(warningsBuffer);
377                            }
378                    }
379    
380                    for (Element child : element.children()) {
381                            visit(child);
382                    }
383            }
384    
385            /**
386             * Creates the appropriate {@link VCardType} instance, given the type name.
387             * This method does not unmarshal the type, it just creates the type object.
388             * @param typeName the type name (e.g. "fn")
389             * @return the type object or null if the type name was not recognized
390             */
391            private VCardType createTypeObject(String typeName) {
392                    typeName = typeName.toLowerCase();
393                    VCardType t = null;
394                    Class<? extends VCardType> clazz = TypeList.getTypeClassByHCardTypeName(typeName);
395                    if (clazz != null) {
396                            try {
397                                    //create a new instance of the class
398                                    t = clazz.newInstance();
399                            } catch (Exception e) {
400                                    //it is the responsibility of the EZ-vCard developer to ensure that this exception is never thrown
401                                    //all type classes defined in the EZ-vCard library MUST have public, no-arg constructors
402                                    throw new RuntimeException(e);
403                            }
404                    } else {
405                            Class<? extends VCardType> extendedTypeClass = extendedTypeClasses.get(typeName);
406                            if (extendedTypeClass != null) {
407                                    try {
408                                            t = extendedTypeClass.newInstance();
409                                    } catch (Exception e) {
410                                            //this should never happen because the type class is checked to see if it has a public, no-arg constructor in the "registerExtendedType" method
411                                            throw new RuntimeException("Extended type class \"" + extendedTypeClass.getName() + "\" must have a public, no-arg constructor.");
412                                    }
413                            } else if (typeName.startsWith("x-")) {
414                                    t = new RawType(typeName); //use RawType instead of TextType because we don't want to unescape any characters that might be meaningful to this type
415                            }
416                    }
417                    return t;
418            }
419    
420            /**
421             * Adds a type object to the vCard.
422             * @param t the type object
423             * @param vcard the vCard
424             */
425            private void addToVCard(VCardType t, VCard vcard) {
426                    Method method = TypeList.getAddMethod(t.getClass());
427                    if (method != null) {
428                            try {
429                                    method.invoke(vcard, t);
430                            } catch (Exception e) {
431                                    //this should NEVER be thrown because the method MUST be public
432                                    throw new RuntimeException(e);
433                            }
434                    } else {
435                            vcard.addExtendedType(t);
436                    }
437            }
438    
439            /**
440             * Gets the type name from a type class.
441             * @param clazz the type class
442             * @return the type name
443             */
444            private String getTypeNameFromTypeClass(Class<? extends VCardType> clazz) {
445                    try {
446                            VCardType t = clazz.newInstance();
447                            return t.getTypeName().toLowerCase();
448                    } catch (Exception e) {
449                            //there is no public, no-arg constructor
450                            throw new RuntimeException(e);
451                    }
452            }
453    }