001package ezvcard.io.html;
002
import static ezvcard.util.HtmlUtils.isChildOf;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.UncheckedIOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import ezvcard.VCard;
import ezvcard.VCardVersion;
import ezvcard.io.CannotParseException;
import ezvcard.io.EmbeddedVCardException;
import ezvcard.io.ParseWarning;
import ezvcard.io.SkipMeException;
import ezvcard.io.StreamReader;
import ezvcard.io.scribe.RawPropertyScribe;
import ezvcard.io.scribe.VCardPropertyScribe;
import ezvcard.property.Categories;
import ezvcard.property.Email;
import ezvcard.property.Impp;
import ezvcard.property.Label;
import ezvcard.property.Nickname;
import ezvcard.property.RawProperty;
import ezvcard.property.Telephone;
import ezvcard.property.Url;
import ezvcard.property.VCardProperty;
import ezvcard.util.Gobble;
import ezvcard.util.IOUtils;
042
043/*
044 Copyright (c) 2012-2023, Michael Angstadt
045 All rights reserved.
046
047 Redistribution and use in source and binary forms, with or without
048 modification, are permitted provided that the following conditions are met: 
049
050 1. Redistributions of source code must retain the above copyright notice, this
051 list of conditions and the following disclaimer. 
052 2. Redistributions in binary form must reproduce the above copyright notice,
053 this list of conditions and the following disclaimer in the documentation
054 and/or other materials provided with the distribution. 
055
056 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
057 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
058 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
059 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
060 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
061 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
062 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
063 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
064 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
065 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
066
067 The views and conclusions contained in the software and documentation are those
068 of the authors and should not be interpreted as representing official policies, 
069 either expressed or implied, of the FreeBSD Project.
070 */
071
072/**
073 * <p>
074 * Parses {@link VCard} objects from an HTML page (hCard format).
075 * </p>
076 * <p>
077 * <b>Example:</b>
078 * </p>
079 * 
080 * <pre class="brush:java">
081 * URL url = new URL("http://example.com");
082 * HCardParser parser = new HCardParser(url);
083 * List&lt;VCard&gt; vcards = parser.parseAll();
084 * </pre>
085 * @author Michael Angstadt
086 * @see <a href="http://microformats.org/wiki/hcard">http://microformats.org/
087 * wiki/hcard</a>
088 */
089public class HCardParser extends StreamReader {
090        private final String pageUrl;
091        private final Elements vcardElements;
092        private final Iterator<Element> vcardElementsIt;
093        private final List<Label> labels = new ArrayList<>();
094
095        private VCard vcard;
096        private Elements embeddedVCards = new Elements();
097        private Nickname nickname;
098        private Categories categories;
099
100        private final String urlPropertyName = index.getPropertyScribe(Url.class).getPropertyName().toLowerCase();
101        private final String categoriesName = index.getPropertyScribe(Categories.class).getPropertyName().toLowerCase();
102        private final String emailName = index.getPropertyScribe(Email.class).getPropertyName().toLowerCase();
103        private final String telName = index.getPropertyScribe(Telephone.class).getPropertyName().toLowerCase();
104
105        /**
106         * Creates an hCard document.
107         * @param url the URL of the webpage
108         * @throws IOException if there's a problem loading the webpage
109         */
110        public HCardParser(URL url) throws IOException {
111                this(Jsoup.parse(url, 30000), url.toString());
112        }
113
114        /**
115         * Creates an hCard document.
116         * @param in the input stream to the HTML page
117         * @throws IOException if there's a problem reading the HTML page
118         */
119        public HCardParser(InputStream in) throws IOException {
120                this(in, null);
121        }
122
123        /**
124         * Creates an hCard document.
125         * @param in the input stream to the HTML page
126         * @param pageUrl the original URL of the HTML page (used to resolve
127         * relative links)
128         * @throws IOException if there's a problem reading the HTML page
129         */
130        public HCardParser(InputStream in, String pageUrl) throws IOException {
131                this((pageUrl == null) ? Jsoup.parse(in, null, "") : Jsoup.parse(in, null, pageUrl), pageUrl);
132        }
133
134        /**
135         * Creates an hCard document.
136         * @param file the HTML file
137         * @throws IOException if there's a problem reading the HTML file
138         */
139        public HCardParser(Path file) throws IOException {
140                this(file, null);
141        }
142
143        /**
144         * Creates an hCard document.
145         * @param file the HTML file
146         * @param pageUrl the original URL of the HTML page (used to resolve
147         * relative links)
148         * @throws IOException if there's a problem reading the HTML file
149         */
150        public HCardParser(Path file, String pageUrl) throws IOException {
151                this((pageUrl == null) ? Jsoup.parse(file.toFile(), null, "") : Jsoup.parse(file.toFile(), null, pageUrl), pageUrl);
152        }
153
154        /**
155         * Creates an hCard document.
156         * @param reader the input stream to the HTML page
157         * @throws IOException if there's a problem reading the HTML page
158         */
159        public HCardParser(Reader reader) throws IOException {
160                this(reader, null);
161        }
162
163        /**
164         * Creates an hCard document.
165         * @param reader the input stream to the HTML page
166         * @param pageUrl the original URL of the HTML page (used to resolve
167         * relative links)
168         * @throws IOException if there's a problem reading the HTML page
169         */
170        public HCardParser(Reader reader, String pageUrl) throws IOException {
171                this(new Gobble(reader).asString(), pageUrl);
172        }
173
174        /**
175         * Creates an hCard document.
176         * @param html the HTML page
177         */
178        public HCardParser(String html) {
179                this(html, null);
180        }
181
182        /**
183         * Creates an hCard document.
184         * @param html the HTML page
185         * @param pageUrl the original URL of the HTML page (used to resolve
186         * relative links)
187         */
188        public HCardParser(String html, String pageUrl) {
189                this((pageUrl == null) ? Jsoup.parse(html) : Jsoup.parse(html, pageUrl), pageUrl);
190        }
191
192        /**
193         * Creates an hCard document.
194         * @param document the HTML page
195         */
196        public HCardParser(Document document) {
197                this(document, null);
198        }
199
200        /**
201         * Creates an hCard document.
202         * @param document the HTML page
203         * @param pageUrl the original URL of the HTML page (used to resolve
204         * relative links)
205         */
206        public HCardParser(Document document, String pageUrl) {
207                this.pageUrl = pageUrl;
208
209                String anchor = null;
210                if (pageUrl != null) {
211                        try {
212                                URL url = new URL(pageUrl);
213                                anchor = url.getRef();
214                        } catch (MalformedURLException e) {
215                                anchor = null;
216                        }
217                }
218
219                Element searchUnder = null;
220                if (anchor != null) {
221                        searchUnder = document.getElementById(anchor);
222                }
223                if (searchUnder == null) {
224                        searchUnder = document;
225                }
226
227                vcardElements = searchUnder.getElementsByClass("vcard");
228
229                //remove nested vcard elements
230                Iterator<Element> it = vcardElements.iterator();
231                while (it.hasNext()) {
232                        Element element = it.next();
233                        if (isChildOf(element, vcardElements)) {
234                                it.remove();
235                        }
236                }
237
238                vcardElementsIt = vcardElements.iterator();
239        }
240
241        /**
242         * Constructor for reading embedded vCards.
243         * @param embeddedVCard the HTML element of the embedded vCard
244         * @param pageUrl the original URL of the HTML page
245         */
246        private HCardParser(Element embeddedVCard, String pageUrl) {
247                this.pageUrl = pageUrl;
248                vcardElements = new Elements(embeddedVCard);
249                vcardElementsIt = vcardElements.iterator();
250        }
251
252        @Override
253        public VCard readNext() {
254                try {
255                        return super.readNext();
256                } catch (IOException e) {
257                        //will not be thrown because reading from DOM
258                        throw new UncheckedIOException(e);
259                }
260        }
261
262        @Override
263        protected VCard _readNext() {
264                if (!vcardElementsIt.hasNext()) {
265                        return null;
266                }
267
268                context.setVersion(VCardVersion.V3_0);
269                parseVCardElement(vcardElementsIt.next());
270                return vcard;
271        }
272
273        private void parseVCardElement(Element vcardElement) {
274                labels.clear();
275                nickname = null;
276                categories = null;
277
278                vcard = new VCard();
279                vcard.setVersion(VCardVersion.V3_0);
280                if (pageUrl != null) {
281                        vcard.addSource(pageUrl);
282                }
283
284                //visit all descendant nodes, depth-first
285                for (Element child : vcardElement.children()) {
286                        visit(child);
287                }
288
289                //assign labels to their addresses
290                assignLabels(vcard, labels);
291        }
292
293        private void visit(Element element) {
294                boolean visitChildren = true;
295                Set<String> classNames = element.classNames();
296                for (String className : classNames) {
297                        className = className.toLowerCase();
298
299                        //give special treatment to certain URLs
300                        if (urlPropertyName.equals(className)) {
301                                String href = element.attr("href");
302                                if (href.length() > 0) {
303                                        if (!classNames.contains(emailName) && href.matches("(?i)mailto:.*")) {
304                                                className = emailName;
305                                        } else if (!classNames.contains(telName) && href.matches("(?i)tel:.*")) {
306                                                className = telName;
307                                        } else {
308                                                //try parsing as IMPP
309                                                VCardPropertyScribe<? extends VCardProperty> scribe = index.getPropertyScribe(Impp.class);
310
311                                                context.getWarnings().clear();
312                                                context.setPropertyName(scribe.getPropertyName());
313                                                try {
314                                                        VCardProperty property = scribe.parseHtml(new HCardElement(element), context);
315                                                        vcard.addProperty(property);
316                                                        warnings.addAll(context.getWarnings());
317                                                        continue;
318                                                } catch (SkipMeException e) {
319                                                        //URL is not an instant messenger URL
320                                                } catch (CannotParseException e) {
321                                                        //URL is not an instant messenger URL
322                                                }
323                                        }
324                                }
325                        }
326
327                        //hCard uses a different name for the CATEGORIES property
328                        if ("category".equals(className)) {
329                                className = categoriesName;
330                        }
331
332                        VCardPropertyScribe<? extends VCardProperty> scribe = index.getPropertyScribe(className);
333                        if (scribe == null) {
334                                //if no scribe is found, and the class name doesn't start with "x-", then it must be an arbitrary CSS class that has nothing to do with vCard
335                                if (!className.startsWith("x-")) {
336                                        continue;
337                                }
338                                scribe = new RawPropertyScribe(className);
339                        }
340
341                        context.getWarnings().clear();
342                        context.setPropertyName(scribe.getPropertyName());
343
344                        VCardProperty property;
345                        try {
346                                property = scribe.parseHtml(new HCardElement(element), context);
347                                warnings.addAll(context.getWarnings());
348
349                                //LABELs must be treated specially so they can be matched up with their ADRs
350                                if (property instanceof Label) {
351                                        labels.add((Label) property);
352                                        continue;
353                                }
354
355                                //add all NICKNAMEs to the same type object
356                                if (property instanceof Nickname) {
357                                        Nickname nn = (Nickname) property;
358                                        if (nickname == null) {
359                                                nickname = nn;
360                                                vcard.addProperty(nickname);
361                                        } else {
362                                                nickname.getValues().addAll(nn.getValues());
363                                        }
364                                        continue;
365                                }
366
367                                //add all CATEGORIES to the same type object
368                                if (property instanceof Categories) {
369                                        Categories c = (Categories) property;
370                                        if (categories == null) {
371                                                categories = c;
372                                                vcard.addProperty(categories);
373                                        } else {
374                                                categories.getValues().addAll(c.getValues());
375                                        }
376                                        continue;
377                                }
378                        } catch (SkipMeException e) {
379                                //@formatter:off
380                                warnings.add(new ParseWarning.Builder(context)
381                                        .message(22, e.getMessage())
382                                        .build()
383                                );
384                                //@formatter:on
385                                continue;
386                        } catch (CannotParseException e) {
387                                //@formatter:off
388                                warnings.add(new ParseWarning.Builder(context)
389                                        .message(e)
390                                        .build()
391                                );
392                                //@formatter:on
393
394                                property = new RawProperty(className, element.outerHtml());
395                        } catch (EmbeddedVCardException e) {
396                                if (isChildOf(element, embeddedVCards)) {
397                                        //prevents multiple-nested embedded elements from overwriting each other
398                                        continue;
399                                }
400
401                                property = e.getProperty();
402
403                                embeddedVCards.add(element);
404                                HCardParser embeddedReader = new HCardParser(element, pageUrl);
405                                try {
406                                        VCard embeddedVCard = embeddedReader.readNext();
407                                        e.injectVCard(embeddedVCard);
408                                } finally {
409                                        warnings.addAll(embeddedReader.getWarnings());
410                                        IOUtils.closeQuietly(embeddedReader);
411                                }
412                                visitChildren = false;
413                        }
414
415                        vcard.addProperty(property);
416                }
417
418                if (visitChildren) {
419                        for (Element child : element.children()) {
420                                visit(child);
421                        }
422                }
423        }
424
425        public void close() {
426                //empty
427        }
428}