001package ezvcard.io.html;
002
003import static ezvcard.util.HtmlUtils.isChildOf;
004
005import java.io.File;
006import java.io.IOException;
007import java.io.InputStream;
008import java.io.Reader;
009import java.net.MalformedURLException;
010import java.net.URL;
011import java.util.ArrayList;
012import java.util.Iterator;
013import java.util.List;
014import java.util.Set;
015
016import org.jsoup.Jsoup;
017import org.jsoup.nodes.Document;
018import org.jsoup.nodes.Element;
019import org.jsoup.select.Elements;
020
021import ezvcard.VCard;
022import ezvcard.VCardVersion;
023import ezvcard.io.CannotParseException;
024import ezvcard.io.EmbeddedVCardException;
025import ezvcard.io.ParseWarning;
026import ezvcard.io.SkipMeException;
027import ezvcard.io.StreamReader;
028import ezvcard.io.scribe.RawPropertyScribe;
029import ezvcard.io.scribe.VCardPropertyScribe;
030import ezvcard.property.Categories;
031import ezvcard.property.Email;
032import ezvcard.property.Impp;
033import ezvcard.property.Label;
034import ezvcard.property.Nickname;
035import ezvcard.property.RawProperty;
036import ezvcard.property.Telephone;
037import ezvcard.property.Url;
038import ezvcard.property.VCardProperty;
039import ezvcard.util.Gobble;
040import ezvcard.util.IOUtils;
041
042/*
043 Copyright (c) 2012-2018, Michael Angstadt
044 All rights reserved.
045
046 Redistribution and use in source and binary forms, with or without
047 modification, are permitted provided that the following conditions are met: 
048
049 1. Redistributions of source code must retain the above copyright notice, this
050 list of conditions and the following disclaimer. 
051 2. Redistributions in binary form must reproduce the above copyright notice,
052 this list of conditions and the following disclaimer in the documentation
053 and/or other materials provided with the distribution. 
054
055 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
056 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
057 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
058 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
059 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
060 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
061 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
062 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
063 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
064 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
065
066 The views and conclusions contained in the software and documentation are those
067 of the authors and should not be interpreted as representing official policies, 
068 either expressed or implied, of the FreeBSD Project.
069 */
070
071/**
072 * <p>
073 * Parses {@link VCard} objects from an HTML page (hCard format).
074 * </p>
075 * <p>
076 * <b>Example:</b>
077 * </p>
078 * 
079 * <pre class="brush:java">
080 * URL url = new URL("http://example.com");
081 * HCardParser parser = new HCardParser(url);
082 * List&lt;VCard&gt; vcards = parser.parseAll();
083 * </pre>
084 * @author Michael Angstadt
085 * @see <a href="http://microformats.org/wiki/hcard">http://microformats.org/
086 * wiki/hcard</a>
087 */
088public class HCardParser extends StreamReader {
089        private final String pageUrl;
090        private final Elements vcardElements;
091        private final Iterator<Element> vcardElementsIt;
092        private final List<Label> labels = new ArrayList<Label>();
093
094        private VCard vcard;
095        private Elements embeddedVCards = new Elements();
096        private Nickname nickname;
097        private Categories categories;
098
099        private final String urlPropertyName = index.getPropertyScribe(Url.class).getPropertyName().toLowerCase();
100        private final String categoriesName = index.getPropertyScribe(Categories.class).getPropertyName().toLowerCase();
101        private final String emailName = index.getPropertyScribe(Email.class).getPropertyName().toLowerCase();
102        private final String telName = index.getPropertyScribe(Telephone.class).getPropertyName().toLowerCase();
103
104        /**
105         * Creates an hCard document.
106         * @param url the URL of the webpage
107         * @throws IOException if there's a problem loading the webpage
108         */
109        public HCardParser(URL url) throws IOException {
110                this(Jsoup.parse(url, 30000), url.toString());
111        }
112
113        /**
114         * Creates an hCard document.
115         * @param in the input stream to the HTML page
116         * @throws IOException if there's a problem reading the HTML page
117         */
118        public HCardParser(InputStream in) throws IOException {
119                this(in, null);
120        }
121
122        /**
123         * Creates an hCard document.
124         * @param in the input stream to the HTML page
125         * @param pageUrl the original URL of the HTML page (used to resolve
126         * relative links)
127         * @throws IOException if there's a problem reading the HTML page
128         */
129        public HCardParser(InputStream in, String pageUrl) throws IOException {
130                this((pageUrl == null) ? Jsoup.parse(in, null, "") : Jsoup.parse(in, null, pageUrl), pageUrl);
131        }
132
133        /**
134         * Creates an hCard document.
135         * @param file the HTML file
136         * @throws IOException if there's a problem reading the HTML file
137         */
138        public HCardParser(File file) throws IOException {
139                this(file, null);
140        }
141
142        /**
143         * Creates an hCard document.
144         * @param file the HTML file
145         * @param pageUrl the original URL of the HTML page (used to resolve
146         * relative links)
147         * @throws IOException if there's a problem reading the HTML file
148         */
149        public HCardParser(File file, String pageUrl) throws IOException {
150                this((pageUrl == null) ? Jsoup.parse(file, null, "") : Jsoup.parse(file, null, pageUrl), pageUrl);
151        }
152
153        /**
154         * Creates an hCard document.
155         * @param reader the input stream to the HTML page
156         * @throws IOException if there's a problem reading the HTML page
157         */
158        public HCardParser(Reader reader) throws IOException {
159                this(reader, null);
160        }
161
162        /**
163         * Creates an hCard document.
164         * @param reader the input stream to the HTML page
165         * @param pageUrl the original URL of the HTML page (used to resolve
166         * relative links)
167         * @throws IOException if there's a problem reading the HTML page
168         */
169        public HCardParser(Reader reader, String pageUrl) throws IOException {
170                this(new Gobble(reader).asString(), pageUrl);
171        }
172
173        /**
174         * Creates an hCard document.
175         * @param html the HTML page
176         */
177        public HCardParser(String html) {
178                this(html, null);
179        }
180
181        /**
182         * Creates an hCard document.
183         * @param html the HTML page
184         * @param pageUrl the original URL of the HTML page (used to resolve
185         * relative links)
186         */
187        public HCardParser(String html, String pageUrl) {
188                this((pageUrl == null) ? Jsoup.parse(html) : Jsoup.parse(html, pageUrl), pageUrl);
189        }
190
191        /**
192         * Creates an hCard document.
193         * @param document the HTML page
194         */
195        public HCardParser(Document document) {
196                this(document, null);
197        }
198
199        /**
200         * Creates an hCard document.
201         * @param document the HTML page
202         * @param pageUrl the original URL of the HTML page (used to resolve
203         * relative links)
204         */
205        public HCardParser(Document document, String pageUrl) {
206                this.pageUrl = pageUrl;
207
208                String anchor = null;
209                if (pageUrl != null) {
210                        try {
211                                URL url = new URL(pageUrl);
212                                anchor = url.getRef();
213                        } catch (MalformedURLException e) {
214                                anchor = null;
215                        }
216                }
217
218                Element searchUnder = null;
219                if (anchor != null) {
220                        searchUnder = document.getElementById(anchor);
221                }
222                if (searchUnder == null) {
223                        searchUnder = document;
224                }
225
226                vcardElements = searchUnder.getElementsByClass("vcard");
227
228                //remove nested vcard elements
229                Iterator<Element> it = vcardElements.iterator();
230                while (it.hasNext()) {
231                        Element element = it.next();
232                        if (isChildOf(element, vcardElements)) {
233                                it.remove();
234                        }
235                }
236
237                vcardElementsIt = vcardElements.iterator();
238        }
239
240        /**
241         * Constructor for reading embedded vCards.
242         * @param embeddedVCard the HTML element of the embedded vCard
243         * @param pageUrl the original URL of the HTML page
244         */
245        private HCardParser(Element embeddedVCard, String pageUrl) {
246                this.pageUrl = pageUrl;
247                vcardElements = new Elements(embeddedVCard);
248                vcardElementsIt = vcardElements.iterator();
249        }
250
251        @Override
252        public VCard readNext() {
253                try {
254                        return super.readNext();
255                } catch (IOException e) {
256                        //will not be thrown because reading from DOM
257                        throw new RuntimeException(e);
258                }
259        }
260
261        @Override
262        protected VCard _readNext() {
263                if (!vcardElementsIt.hasNext()) {
264                        return null;
265                }
266
267                context.setVersion(VCardVersion.V3_0);
268                parseVCardElement(vcardElementsIt.next());
269                return vcard;
270        }
271
272        private void parseVCardElement(Element vcardElement) {
273                labels.clear();
274                nickname = null;
275                categories = null;
276
277                vcard = new VCard();
278                vcard.setVersion(VCardVersion.V3_0);
279                if (pageUrl != null) {
280                        vcard.addSource(pageUrl);
281                }
282
283                //visit all descendant nodes, depth-first
284                for (Element child : vcardElement.children()) {
285                        visit(child);
286                }
287
288                //assign labels to their addresses
289                assignLabels(vcard, labels);
290        }
291
292        private void visit(Element element) {
293                boolean visitChildren = true;
294                Set<String> classNames = element.classNames();
295                for (String className : classNames) {
296                        className = className.toLowerCase();
297
298                        //give special treatment to certain URLs
299                        if (urlPropertyName.equals(className)) {
300                                String href = element.attr("href");
301                                if (href.length() > 0) {
302                                        if (!classNames.contains(emailName) && href.matches("(?i)mailto:.*")) {
303                                                className = emailName;
304                                        } else if (!classNames.contains(telName) && href.matches("(?i)tel:.*")) {
305                                                className = telName;
306                                        } else {
307                                                //try parsing as IMPP
308                                                VCardPropertyScribe<? extends VCardProperty> scribe = index.getPropertyScribe(Impp.class);
309
310                                                context.getWarnings().clear();
311                                                context.setPropertyName(scribe.getPropertyName());
312                                                try {
313                                                        VCardProperty property = scribe.parseHtml(new HCardElement(element), context);
314                                                        vcard.addProperty(property);
315                                                        warnings.addAll(context.getWarnings());
316                                                        continue;
317                                                } catch (SkipMeException e) {
318                                                        //URL is not an instant messenger URL
319                                                } catch (CannotParseException e) {
320                                                        //URL is not an instant messenger URL
321                                                }
322                                        }
323                                }
324                        }
325
326                        //hCard uses a different name for the CATEGORIES property
327                        if ("category".equals(className)) {
328                                className = categoriesName;
329                        }
330
331                        VCardPropertyScribe<? extends VCardProperty> scribe = index.getPropertyScribe(className);
332                        if (scribe == null) {
333                                //if no scribe is found, and the class name doesn't start with "x-", then it must be an arbitrary CSS class that has nothing to do with vCard
334                                if (!className.startsWith("x-")) {
335                                        continue;
336                                }
337                                scribe = new RawPropertyScribe(className);
338                        }
339
340                        context.getWarnings().clear();
341                        context.setPropertyName(scribe.getPropertyName());
342
343                        VCardProperty property;
344                        try {
345                                property = scribe.parseHtml(new HCardElement(element), context);
346                                warnings.addAll(context.getWarnings());
347
348                                //LABELs must be treated specially so they can be matched up with their ADRs
349                                if (property instanceof Label) {
350                                        labels.add((Label) property);
351                                        continue;
352                                }
353
354                                //add all NICKNAMEs to the same type object
355                                if (property instanceof Nickname) {
356                                        Nickname nn = (Nickname) property;
357                                        if (nickname == null) {
358                                                nickname = nn;
359                                                vcard.addProperty(nickname);
360                                        } else {
361                                                nickname.getValues().addAll(nn.getValues());
362                                        }
363                                        continue;
364                                }
365
366                                //add all CATEGORIES to the same type object
367                                if (property instanceof Categories) {
368                                        Categories c = (Categories) property;
369                                        if (categories == null) {
370                                                categories = c;
371                                                vcard.addProperty(categories);
372                                        } else {
373                                                categories.getValues().addAll(c.getValues());
374                                        }
375                                        continue;
376                                }
377                        } catch (SkipMeException e) {
378                                //@formatter:off
379                                warnings.add(new ParseWarning.Builder(context)
380                                        .message(22, e.getMessage())
381                                        .build()
382                                );
383                                //@formatter:on
384                                continue;
385                        } catch (CannotParseException e) {
386                                //@formatter:off
387                                warnings.add(new ParseWarning.Builder(context)
388                                        .message(e)
389                                        .build()
390                                );
391                                //@formatter:on
392
393                                property = new RawProperty(className, element.outerHtml());
394                        } catch (EmbeddedVCardException e) {
395                                if (isChildOf(element, embeddedVCards)) {
396                                        //prevents multiple-nested embedded elements from overwriting each other
397                                        continue;
398                                }
399
400                                property = e.getProperty();
401
402                                embeddedVCards.add(element);
403                                HCardParser embeddedReader = new HCardParser(element, pageUrl);
404                                try {
405                                        VCard embeddedVCard = embeddedReader.readNext();
406                                        e.injectVCard(embeddedVCard);
407                                } finally {
408                                        warnings.addAll(embeddedReader.getWarnings());
409                                        IOUtils.closeQuietly(embeddedReader);
410                                }
411                                visitChildren = false;
412                        }
413
414                        vcard.addProperty(property);
415                }
416
417                if (visitChildren) {
418                        for (Element child : element.children()) {
419                                visit(child);
420                        }
421                }
422        }
423
424        public void close() {
425                //empty
426        }
427}