2 * This file is part of NeverNote
\r
3 * Copyright 2009 Randy Baumgarte
\r
5 * This file may be licensed under the terms of the
\r
6 * GNU General Public License Version 2 (the ``GPL'').
\r
8 * Software distributed under the License is distributed
\r
9 * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
\r
10 * express or implied. See the GPL for the specific language
\r
11 * governing rights and limitations.
\r
13 * You should have received a copy of the GPL along with this
\r
14 * program. If not, go to http://www.gnu.org/licenses/gpl.html
\r
15 * or write to the Free Software Foundation, Inc.,
\r
16 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
\r
19 package cx.fbn.nevernote.evernote;
\r
21 import java.io.File;
\r
22 import java.util.ArrayList;
\r
23 import java.util.List;
\r
25 import cx.fbn.nevernote.Global;
\r
26 import cx.fbn.nevernote.utilities.ApplicationLogger;
\r
27 import cx.fbn.nevernote.xml.XMLCleanup;
\r
28 import cx.fbn.nevernote.xml.XMLNoteRepair;
\r
30 public class EnmlConverter {
\r
31 private final ApplicationLogger logger;
\r
32 private List<String> resources;
\r
33 public boolean saveInvalidXML;
\r
35 public EnmlConverter(ApplicationLogger l) {
\r
38 saveInvalidXML = false;
\r
39 resources = new ArrayList<String>();
\r
42 public List<String> getResources() {
\r
45 public String convert(String noteGuid, String content) {
\r
46 logger.log(logger.HIGH, "Entering DBRunner.convertToEnml");
\r
47 logger.log(logger.EXTREME, "Note Text:" +content);
\r
49 // Replace the en-note tags with body tags in case we came from
\r
50 // someplace other than the editor (for example, if we are merging notes).
\r
51 content = content.replace("<en-note>", "<body>");
\r
52 content = content.replace("</en-note>", "</body>");
\r
53 // Start removing stuff we don't need or want
\r
54 int br = content.lastIndexOf("</body>");
\r
56 content = new String(content.substring(0,br));
\r
58 int k = content.indexOf("<body");
\r
60 newContent = new String(content.substring(k));
\r
62 newContent = "<body>"+content;
\r
65 // Check that we have a valid header. Normally we should not
\r
66 // but sometimes it seems that we can. I don't see how, but it is
\r
67 // easy enough to check.
\r
68 if (!newContent.startsWith("<?xml"))
\r
69 newContent = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
\r
70 +"<!DOCTYPE en-note SYSTEM \"http://xml.evernote.com/pub/enml2.dtd\">\n"
\r
75 // Fix the more common XML problems that Webkit creates, but are not considered
\r
77 newContent = fixStupidXMLProblems(newContent);
\r
80 // Change the contents to have enml instead of body tags or
\r
81 // we'll fail validation later.
\r
82 newContent = newContent.replace("<body", "<en-note");
\r
83 newContent = newContent.replace("</body>", "</en-note>");
\r
85 // First pass through the data. The goal of this pass is to
\r
86 // validate that we have a good XML document and to repair
\r
87 // any problems found.
\r
89 XMLNoteRepair repair = new XMLNoteRepair();
\r
90 logger.log(logger.HIGH, "Checking XML Structure");
\r
91 newContent = repair.parse(newContent, false);
\r
92 logger.log(logger.HIGH, "Check complete");
\r
95 // If the repair above returned null, then the XML could not be repaired.
\r
96 // We are done here.
\r
97 if (newContent == null) {
\r
98 // Houston, we've had a problem.
\r
99 logger.log(logger.LOW, "Parse error when converting to ENML");
\r
100 logger.log(logger.LOW, "Start of unmodified note HTML");
\r
101 logger.log(logger.LOW, content);
\r
102 logger.log(logger.LOW, "End of unmodified note HTML");
\r
103 logger.log(logger.LOW, "Start of modified note HTML");
\r
104 logger.log(logger.LOW, newContent);
\r
105 logger.log(logger.LOW, "End of modified note HTML");
\r
106 // logger.log(logger.LOW, result.errorMessage);
\r
107 // logger.log(logger.LOW, "Error Line:Column "+result.errorLine+":" +result.errorColumn);
\r
113 // Second pass through the data. The goal of this pass is to
\r
114 // remove any things we added in NeverNote that do not match
\r
116 XMLCleanup v = new XMLCleanup();
\r
117 v.setValue(newContent);
\r
118 logger.log(logger.HIGH, "Beginning ENML Cleanup");
\r
120 logger.log(logger.HIGH, "Cleanup complete.");
\r
124 // Final pass through the data. In this one we
\r
125 // remove any invalid attributes and to save the
\r
127 logger.log(logger.EXTREME, "Rebuilt ENML:");
\r
128 logger.log(logger.EXTREME, v.getValue());
\r
129 logger.log(logger.EXTREME, "End Of Rebuilt ENML:");
\r
130 resources = v.getResources();
\r
133 // The XML has the dtd to validate set against Evernote's web
\r
134 // address. We change it to a local one because otherwise it would
\r
135 // fail if the user doesn't have internet connectivity. The local copy
\r
136 // also contains the 3 other PUBLIC definitions at the beginning of the dtd.
\r
137 newContent = v.getValue();
\r
138 File dtdFile = new File(Global.getDirectoryPath()+"xml/enml2.dtd");
\r
139 String dtd = dtdFile.toURI().toString();
\r
140 newContent = newContent.replace("<!DOCTYPE en-note SYSTEM \'http://xml.evernote.com/pub/enml2.dtd'>",
\r
141 "<!DOCTYPE en-note SYSTEM \"" +dtd +"\">");
\r
143 logger.log(logger.HIGH, "Validating ENML");
\r
144 newContent = repair.parse(newContent, true);
\r
145 logger.log(logger.HIGH, "Validation complete");
\r
146 saveInvalidXML = repair.saveInvalidXML;
\r
148 // Restore the correct XML header.
\r
149 newContent = newContent.replace("<!DOCTYPE en-note SYSTEM \"" +dtd +"\">",
\r
150 "<!DOCTYPE en-note SYSTEM 'http://xml.evernote.com/pub/enml2.dtd'>");
\r
158 // Fix XML problems that Qt can't deal with
\r
159 public String fixStupidXMLProblems(String content) {
\r
160 logger.log(logger.HIGH, "Entering DBRunner.fixStupidXMLProblems");
\r
162 // Fix the problem that the document body isn't properly closed
\r
163 String newContent = new String(content);
\r
164 logger.log(logger.MEDIUM, "Inside fixStupidXMLProblems. Old content:");
\r
165 logger.log(logger.MEDIUM, content);
\r
167 // Fix the problem that the img tag isn't properly closed
\r
169 logger.log(logger.MEDIUM, "Checking img tags");
\r
170 for (int i=newContent.indexOf("<img"); i>0; i = newContent.indexOf("<img",i+1)) {
\r
171 endPos = newContent.indexOf(">",i+1);
\r
172 String end = newContent.substring(endPos+1);
\r
173 newContent = newContent.subSequence(0,endPos) +"/>"+end;
\r
176 // Fix the problem that the input tag isn't properly closed
\r
177 logger.log(logger.MEDIUM, "Checking input tags");
\r
178 for (int i=newContent.indexOf("<input"); i>0; i = newContent.indexOf("<input",i+1)) {
\r
179 endPos = newContent.indexOf(">",i+1);
\r
180 String end = newContent.substring(endPos+1);
\r
181 newContent = newContent.subSequence(0,endPos) +"/>"+end;
\r
185 // Fix the problem that the <br> tag isn't properly closed
\r
186 logger.log(logger.MEDIUM, "Checking br tags");
\r
187 for (int i=newContent.indexOf("<br"); i>0; i = newContent.indexOf("<br",i+1)) {
\r
188 endPos = newContent.indexOf(">",i+1);
\r
189 String end = newContent.substring(endPos+1);
\r
190 newContent = newContent.subSequence(0,endPos) +"/>"+end;
\r
193 // Fix the problem that the <hr> tag isn't properly closed
\r
194 logger.log(logger.MEDIUM, "Checking hr tags");
\r
195 for (int i=newContent.indexOf("<hr"); i>0; i = newContent.indexOf("<hr",i+1)) {
\r
196 endPos = newContent.indexOf(">",i+1);
\r
197 String end = newContent.substring(endPos+1);
\r
198 newContent = newContent.subSequence(0,endPos) +"/>"+end;
\r
201 logger.log(logger.MEDIUM, "Leaving fixStupidXMLProblems");
\r
202 logger.log(logger.HIGH, "Leaving DBRunner.fixStupidXMLProblems");
\r
203 return newContent.toString();
\r
207 // Fix XML that Evernote thinks is invalid
\r
208 public String fixEnXMLCrap(String note) {
\r
210 StringBuffer buffer = new StringBuffer(note);
\r
212 // change all <b/> to <b></b> because Evernote hates them if they happen in <span>
\r
213 pos = buffer.indexOf("<b/>");
\r
215 buffer.replace(pos, pos+4, "<b></b>");
\r
216 pos = buffer.indexOf("<b/>",pos);
\r
218 // change all <br/> to <br></br> because Evernote hates them if they happen in <span>
\r
219 pos = buffer.indexOf("<br/>");
\r
221 buffer.replace(pos, pos+5, "<br></br>");
\r
222 pos = buffer.indexOf("<br/>",pos);
\r
225 // change all <span> elements in lists because Evernote hates them if they happen
\r
228 pos = buffer.indexOf("<li>");
\r
229 spanPos = buffer.indexOf("<span>");
\r
230 /* for (; pos>-1 && spanPos >-1;) {
\r
231 endPos = buffer.indexOf("</li>",pos);
\r
232 if (spanPos > pos && spanPos < endPos) {
\r
233 buffer.replace(spanPos,spanPos+6,"");
\r
234 spanPos = buffer.indexOf("</span>");
\r
235 buffer.replace(spanPos,spanPos+7,"");
\r
237 pos=buffer.indexOf("<li>",pos+1);
\r
238 spanPos = buffer.indexOf("<span>",spanPos);
\r
241 // Get rid of empty spans in <li> elements
\r
242 pos = buffer.indexOf("<li>");
\r
243 spanPos = buffer.indexOf("<span/>");
\r
244 for (; pos>-1 && spanPos >-1;) {
\r
245 endPos = buffer.indexOf("</li>",pos);
\r
246 if (spanPos > pos && spanPos < endPos) {
\r
247 buffer.replace(spanPos,spanPos+7,"");
\r
249 pos=buffer.indexOf("<li>",pos+1);
\r
250 spanPos = buffer.indexOf("<span/>",spanPos);
\r
253 return buffer.toString();
\r
256 // Fix stupid en-media problems
\r
257 public String fixEnMediaCrap(String note) {
\r
261 StringBuffer buffer = new StringBuffer(note);
\r
262 // get rid of any </en-media> tags since they shouldn't exist.
\r
263 int pos = buffer.indexOf("</en-media>");
\r
265 buffer.replace(pos, pos+11, "");
\r
266 pos = buffer.indexOf("</en-media>",pos);
\r
270 // Make sure we have a proper /> ending the en-media tag
\r
271 pos = buffer.indexOf("<en-media");
\r
273 pos=buffer.indexOf(">", pos);
\r
274 if (!buffer.substring(pos-1,pos).equals("/"))
\r
275 buffer.replace(pos, pos+1, " />");
\r
276 pos = buffer.indexOf("<en-media",pos);
\r
279 return buffer.toString();
\r