HTParse.c 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187
  1. /* Parse HyperText Document Address HTParse.c
  2. * ================================
  3. */
  4. #include <HTUtils.h>
  5. #include <HTParse.h>
  6. #include <LYUtils.h>
  7. #include <LYLeaks.h>
  8. #include <LYStrings.h>
  9. #include <LYCharUtils.h>
  10. #ifdef HAVE_ALLOCA_H
  11. #include <alloca.h>
  12. #else
  13. #ifdef __MINGW32__
  14. #include <malloc.h>
  15. #endif /* __MINGW32__ */
  16. #endif
  17. #define HEX_ESCAPE '%'
  18. struct struct_parts {
  19. char *access;
  20. char *host;
  21. char *absolute;
  22. char *relative;
  23. char *search; /* treated normally as part of path */
  24. char *anchor;
  25. };
  26. #if 0 /* for debugging */
  27. static void show_parts(const char *name, struct struct_parts *parts, int line)
  28. {
  29. if (TRACE) {
  30. CTRACE((tfp, "struct_parts(%s) %s@%d\n", name, __FILE__, line));
  31. CTRACE((tfp, " access '%s'\n", NONNULL(parts->access)));
  32. CTRACE((tfp, " host '%s'\n", NONNULL(parts->host)));
  33. CTRACE((tfp, " absolute '%s'\n", NONNULL(parts->absolute)));
  34. CTRACE((tfp, " relative '%s'\n", NONNULL(parts->relative)));
  35. CTRACE((tfp, " search '%s'\n", NONNULL(parts->search)));
  36. CTRACE((tfp, " anchor '%s'\n", NONNULL(parts->anchor)));
  37. }
  38. }
  39. #define SHOW_PARTS(name) show_parts(#name, &name, __LINE__)
  40. #else
  41. #define SHOW_PARTS(name) /* nothing */
  42. #endif
  43. /* Strip white space off a string. HTStrip()
  44. * -------------------------------
  45. *
  46. * On exit,
  47. * Return value points to first non-white character, or to 0 if none.
  48. * All trailing white space is OVERWRITTEN with zero.
  49. */
  50. char *HTStrip(char *s)
  51. {
  52. #define SPACE(c) ((c == ' ') || (c == '\t') || (c == '\n'))
  53. char *p = s;
  54. for (p = s; *p; p++) ; /* Find end of string */
  55. for (p--; p >= s; p--) {
  56. if (SPACE(*p))
  57. *p = '\0'; /* Zap trailing blanks */
  58. else
  59. break;
  60. }
  61. while (SPACE(*s))
  62. s++; /* Strip leading blanks */
  63. return s;
  64. }
  65. /* Scan a filename for its constituents. scan()
  66. * -------------------------------------
  67. *
  68. * On entry,
  69. * name points to a document name which may be incomplete.
  70. * On exit,
  71. * absolute or relative may be nonzero (but not both).
  72. * host, anchor and access may be nonzero if they were specified.
  73. * Any which are nonzero point to zero terminated strings.
  74. */
  75. static void scan(char *name,
  76. struct struct_parts *parts)
  77. {
  78. char *after_access;
  79. char *p;
  80. parts->access = NULL;
  81. parts->host = NULL;
  82. parts->absolute = NULL;
  83. parts->relative = NULL;
  84. parts->search = NULL; /* normally not used - kw */
  85. parts->anchor = NULL;
  86. /*
  87. * Scan left-to-right for a scheme (access).
  88. */
  89. after_access = name;
  90. for (p = name; *p; p++) {
  91. if (*p == ':') {
  92. *p = '\0';
  93. parts->access = name; /* Access name has been specified */
  94. after_access = (p + 1);
  95. break;
  96. }
  97. if (*p == '/' || *p == '#' || *p == ';' || *p == '?')
  98. break;
  99. }
  100. /*
  101. * Scan left-to-right for a fragment (anchor).
  102. */
  103. for (p = after_access; *p; p++) {
  104. if (*p == '#') {
  105. parts->anchor = (p + 1);
  106. *p = '\0'; /* terminate the rest */
  107. break; /* leave things after first # alone - kw */
  108. }
  109. }
  110. /*
  111. * Scan left-to-right for a host or absolute path.
  112. */
  113. p = after_access;
  114. if (*p == '/') {
  115. if (p[1] == '/') {
  116. parts->host = (p + 2); /* host has been specified */
  117. *p = '\0'; /* Terminate access */
  118. p = strchr(parts->host, '/'); /* look for end of host name if any */
  119. if (p != NULL) {
  120. *p = '\0'; /* Terminate host */
  121. parts->absolute = (p + 1); /* Root has been found */
  122. } else {
  123. p = strchr(parts->host, '?');
  124. if (p != NULL) {
  125. *p = '\0'; /* Terminate host */
  126. parts->search = (p + 1);
  127. }
  128. }
  129. } else {
  130. parts->absolute = (p + 1); /* Root found but no host */
  131. }
  132. } else {
  133. parts->relative = (*after_access) ?
  134. after_access : NULL; /* NULL for "" */
  135. }
  136. /*
  137. * Check schemes that commonly have unescaped hashes.
  138. */
  139. if (parts->access && parts->anchor &&
  140. /* optimize */ strchr("lnsdLNSD", *parts->access) != NULL) {
  141. if ((!parts->host && strcasecomp(parts->access, "lynxcgi")) ||
  142. !strcasecomp(parts->access, "nntp") ||
  143. !strcasecomp(parts->access, "snews") ||
  144. !strcasecomp(parts->access, "news") ||
  145. !strcasecomp(parts->access, "data")) {
  146. /*
  147. * Access specified but no host and not a lynxcgi URL, so the
  148. * anchor may not really be one, e.g., news:j462#36487@foo.bar, or
  149. * it's an nntp or snews URL, or news URL with a host. Restore the
  150. * '#' in the address.
  151. */
  152. /* but only if we have found a path component of which this will
  153. * become part. - kw */
  154. if (parts->relative || parts->absolute) {
  155. *(parts->anchor - 1) = '#';
  156. parts->anchor = NULL;
  157. }
  158. }
  159. }
  160. } /*scan */
  161. #if defined(HAVE_ALLOCA) && !defined(LY_FIND_LEAKS)
  162. #define LYalloca(x) alloca(x)
  163. #define LYalloca_free(x) {}
  164. #else
  165. #define LYalloca(x) malloc(x)
  166. #define LYalloca_free(x) free(x)
  167. #endif
  168. static char *strchr_or_end(char *string, int ch)
  169. {
  170. char *result = strchr(string, ch);
  171. if (result == 0) {
  172. result = string + strlen(string);
  173. }
  174. return result;
  175. }
  176. /* Parse a Name relative to another name. HTParse()
  177. * --------------------------------------
  178. *
  179. * This returns those parts of a name which are given (and requested)
  180. * substituting bits from the related name where necessary.
  181. *
  182. * On entry,
  183. * aName A filename given
  184. * relatedName A name relative to which aName is to be parsed
  185. * wanted A mask for the bits which are wanted.
  186. *
  187. * On exit,
  188. * returns A pointer to a malloc'd string which MUST BE FREED
  189. */
  190. char *HTParse(const char *aName,
  191. const char *relatedName,
  192. int wanted)
  193. {
  194. char *result = NULL;
  195. char *tail = NULL; /* a pointer to the end of the 'result' string */
  196. char *return_value = NULL;
  197. int len, len1, len2;
  198. char *name = NULL;
  199. char *rel = NULL;
  200. char *p, *q;
  201. char *acc_method;
  202. struct struct_parts given, related;
  203. CTRACE((tfp, "HTParse: aName:`%s'\n", aName));
  204. CTRACE((tfp, " relatedName:`%s'\n", relatedName));
  205. if (wanted & (PARSE_STRICTPATH | PARSE_QUERY)) { /* if detail wanted... */
  206. if ((wanted & (PARSE_STRICTPATH | PARSE_QUERY))
  207. == (PARSE_STRICTPATH | PARSE_QUERY)) /* if strictpath AND query */
  208. wanted |= PARSE_PATH; /* then treat as if PARSE_PATH wanted */
  209. if (wanted & PARSE_PATH) /* if PARSE_PATH wanted */
  210. wanted &= ~(PARSE_STRICTPATH | PARSE_QUERY); /* ignore details */
  211. }
  212. /* *INDENT-OFF* */
  213. CTRACE((tfp, " want:%s%s%s%s%s%s%s\n",
  214. wanted & PARSE_PUNCTUATION ? " punc" : "",
  215. wanted & PARSE_ANCHOR ? " anchor" : "",
  216. wanted & PARSE_PATH ? " path" : "",
  217. wanted & PARSE_HOST ? " host" : "",
  218. wanted & PARSE_ACCESS ? " access" : "",
  219. wanted & PARSE_STRICTPATH ? " PATH" : "",
  220. wanted & PARSE_QUERY ? " QUERY" : ""));
  221. /* *INDENT-ON* */
  222. /*
  223. * Allocate the temporary string. Optimized.
  224. */
  225. len1 = strlen(aName) + 1;
  226. len2 = strlen(relatedName) + 1;
  227. len = len1 + len2 + 8; /* Lots of space: more than enough */
  228. result = tail = (char *) LYalloca(len * 2 + len1 + len2);
  229. if (result == NULL) {
  230. outofmem(__FILE__, "HTParse");
  231. }
  232. *result = '\0';
  233. name = result + len;
  234. rel = name + len1;
  235. /*
  236. * Make working copy of the input string to cut up.
  237. */
  238. memcpy(name, aName, len1);
  239. /*
  240. * Cut up the string into URL fields.
  241. */
  242. scan(name, &given);
  243. SHOW_PARTS(given);
  244. /*
  245. * Now related string.
  246. */
  247. if ((given.access && given.host && given.absolute) || !*relatedName) {
  248. /*
  249. * Inherit nothing!
  250. */
  251. related.access = NULL;
  252. related.host = NULL;
  253. related.absolute = NULL;
  254. related.relative = NULL;
  255. related.search = NULL;
  256. related.anchor = NULL;
  257. } else {
  258. memcpy(rel, relatedName, len2);
  259. scan(rel, &related);
  260. }
  261. SHOW_PARTS(related);
  262. /*
  263. * Handle the scheme (access) field.
  264. */
  265. if (given.access && given.host && !given.relative && !given.absolute) {
  266. if (!strcmp(given.access, "http") ||
  267. !strcmp(given.access, "https") ||
  268. !strcmp(given.access, "ftp"))
  269. /*
  270. * Assume root.
  271. */
  272. given.absolute = "";
  273. }
  274. acc_method = given.access ? given.access : related.access;
  275. if (wanted & PARSE_ACCESS) {
  276. if (acc_method) {
  277. strcpy(tail, acc_method);
  278. tail += strlen(tail);
  279. if (wanted & PARSE_PUNCTUATION) {
  280. *tail++ = ':';
  281. *tail = '\0';
  282. }
  283. }
  284. }
  285. /*
  286. * If different schemes, inherit nothing.
  287. *
  288. * We'll try complying with RFC 1808 and the Fielding draft, and inherit
  289. * nothing if both schemes are given, rather than only when they differ,
  290. * except for file URLs - FM
  291. *
  292. * After trying it for a while, it's still premature, IHMO, to go along
  293. * with it, so this is back to inheriting for identical schemes whether or
  294. * not they are "file". If you want to try it again yourself, uncomment
  295. * the strcasecomp() below. - FM
  296. */
  297. if ((given.access && related.access) &&
  298. ( /* strcasecomp(given.access, "file") || */
  299. strcmp(given.access, related.access))) {
  300. related.host = NULL;
  301. related.absolute = NULL;
  302. related.relative = NULL;
  303. related.search = NULL;
  304. related.anchor = NULL;
  305. }
  306. /*
  307. * Handle the host field.
  308. */
  309. if (wanted & PARSE_HOST) {
  310. if (given.host || related.host) {
  311. if (wanted & PARSE_PUNCTUATION) {
  312. *tail++ = '/';
  313. *tail++ = '/';
  314. }
  315. strcpy(tail, given.host ? given.host : related.host);
  316. #define CLEAN_URLS
  317. #ifdef CLEAN_URLS
  318. /*
  319. * Ignore default port numbers, and trailing dots on FQDNs, which
  320. * will only cause identical addresses to look different. (related
  321. * is already a clean url).
  322. */
  323. {
  324. char *p2, *h;
  325. if ((p2 = strchr(result, '@')) != NULL)
  326. tail = (p2 + 1);
  327. p2 = strchr(tail, ':');
  328. if (p2 != NULL && !isdigit(UCH(p2[1])))
  329. /*
  330. * Colon not followed by a port number.
  331. */
  332. *p2 = '\0';
  333. if (p2 != NULL && *p2 != '\0' && acc_method != NULL) {
  334. /*
  335. * Port specified.
  336. */
  337. #define ACC_METHOD(a,b) (!strcmp(acc_method, a) && !strcmp(p2, b))
  338. if (ACC_METHOD("http", ":80") ||
  339. ACC_METHOD("https", ":443") ||
  340. ACC_METHOD("gopher", ":70") ||
  341. ACC_METHOD("ftp", ":21") ||
  342. ACC_METHOD("wais", ":210") ||
  343. ACC_METHOD("nntp", ":119") ||
  344. ACC_METHOD("news", ":119") ||
  345. ACC_METHOD("newspost", ":119") ||
  346. ACC_METHOD("newsreply", ":119") ||
  347. ACC_METHOD("snews", ":563") ||
  348. ACC_METHOD("snewspost", ":563") ||
  349. ACC_METHOD("snewsreply", ":563") ||
  350. ACC_METHOD("finger", ":79") ||
  351. ACC_METHOD("telnet", ":23") ||
  352. ACC_METHOD("tn3270", ":23") ||
  353. ACC_METHOD("rlogin", ":513") ||
  354. ACC_METHOD("cso", ":105"))
  355. *p2 = '\0'; /* It is the default: ignore it */
  356. }
  357. if (p2 == NULL) {
  358. int len3 = strlen(tail);
  359. if (len3 > 0) {
  360. h = tail + len3 - 1; /* last char of hostname */
  361. if (*h == '.')
  362. *h = '\0'; /* chop final . */
  363. }
  364. } else if (p2 != result) {
  365. h = p2;
  366. h--; /* End of hostname */
  367. if (*h == '.') {
  368. /*
  369. * Slide p2 over h.
  370. */
  371. while (*p2 != '\0')
  372. *h++ = *p2++;
  373. *h = '\0'; /* terminate */
  374. }
  375. }
  376. }
  377. #endif /* CLEAN_URLS */
  378. }
  379. }
  380. /*
  381. * Trim any blanks from the result so far - there's no excuse for blanks
  382. * in a hostname. Also update the tail here.
  383. */
  384. tail = LYRemoveBlanks(result);
  385. /*
  386. * If host in given or related was ended directly with a '?' (no slash),
  387. * fake the search part into absolute. This is the only case search is
  388. * returned from scan. A host must have been present. this restores the
  389. * '?' at which the host part had been truncated in scan, we have to do
  390. * this after host part handling is done. - kw
  391. */
  392. if (given.search && *(given.search - 1) == '\0') {
  393. given.absolute = given.search - 1;
  394. given.absolute[0] = '?';
  395. } else if (related.search && !related.absolute &&
  396. *(related.search - 1) == '\0') {
  397. related.absolute = related.search - 1;
  398. related.absolute[0] = '?';
  399. }
  400. /*
  401. * If different hosts, inherit no path.
  402. */
  403. if (given.host && related.host)
  404. if (strcmp(given.host, related.host) != 0) {
  405. related.absolute = NULL;
  406. related.relative = NULL;
  407. related.anchor = NULL;
  408. }
  409. /*
  410. * Handle the path.
  411. */
  412. if (wanted & (PARSE_PATH | PARSE_STRICTPATH | PARSE_QUERY)) {
  413. int want_detail = (wanted & (PARSE_STRICTPATH | PARSE_QUERY));
  414. if (acc_method && !given.absolute && given.relative) {
  415. /*
  416. * Treat all given nntp or snews paths, or given paths for news
  417. * URLs with a host, as absolute.
  418. */
  419. switch (*acc_method) {
  420. case 'N':
  421. case 'n':
  422. if (!strcasecomp(acc_method, "nntp") ||
  423. (!strcasecomp(acc_method, "news") &&
  424. !strncasecomp(result, "news://", 7))) {
  425. given.absolute = given.relative;
  426. given.relative = NULL;
  427. }
  428. break;
  429. case 'S':
  430. case 's':
  431. if (!strcasecomp(acc_method, "snews")) {
  432. given.absolute = given.relative;
  433. given.relative = NULL;
  434. }
  435. break;
  436. }
  437. }
  438. if (given.absolute) { /* All is given */
  439. if (wanted & PARSE_PUNCTUATION)
  440. *tail++ = '/';
  441. strcpy(tail, given.absolute);
  442. CTRACE((tfp, "HTParse: (ABS)\n"));
  443. } else if (related.absolute) { /* Adopt path not name */
  444. char *base = tail;
  445. *tail++ = '/';
  446. strcpy(tail, related.absolute);
  447. if (given.relative) {
  448. /* RFC 1808 part 4 step 5 (if URL path is empty) */
  449. /* a) if given has params, add/replace that */
  450. if (given.relative[0] == ';') {
  451. strcpy(strchr_or_end(tail, ';'), given.relative);
  452. }
  453. /* b) if given has query, add/replace that */
  454. else if (given.relative[0] == '?') {
  455. strcpy(strchr_or_end(tail, '?'), given.relative);
  456. }
  457. /* otherwise fall through to RFC 1808 part 4 step 6 */
  458. else {
  459. p = strchr(tail, '?'); /* Search part? */
  460. if (p == NULL)
  461. p = (tail + strlen(tail) - 1);
  462. for (; *p != '/'; p--) ; /* last / */
  463. p[1] = '\0'; /* Remove filename */
  464. strcat(p, given.relative); /* Add given one */
  465. }
  466. HTSimplify(base);
  467. if (*base == '\0')
  468. strcpy(base, "/");
  469. }
  470. CTRACE((tfp, "HTParse: (Related-ABS)\n"));
  471. } else if (given.relative) {
  472. strcpy(tail, given.relative); /* what we've got */
  473. CTRACE((tfp, "HTParse: (REL)\n"));
  474. } else if (related.relative) {
  475. strcpy(tail, related.relative);
  476. CTRACE((tfp, "HTParse: (Related-REL)\n"));
  477. } else { /* No inheritance */
  478. if (!isLYNXCGI(aName) &&
  479. !isLYNXEXEC(aName) &&
  480. !isLYNXPROG(aName)) {
  481. *tail++ = '/';
  482. *tail = '\0';
  483. }
  484. if (!strcmp(result, "news:/"))
  485. result[5] = '*';
  486. CTRACE((tfp, "HTParse: (No inheritance)\n"));
  487. }
  488. if (want_detail) {
  489. p = strchr(tail, '?'); /* Search part? */
  490. if (p) {
  491. if (PARSE_STRICTPATH) {
  492. *p = '\0';
  493. } else {
  494. if (!(wanted & PARSE_PUNCTUATION))
  495. p++;
  496. do {
  497. *tail++ = *p;
  498. } while (*p++);
  499. }
  500. } else {
  501. if (wanted & PARSE_QUERY)
  502. *tail = '\0';
  503. }
  504. }
  505. }
  506. /*
  507. * Handle the fragment (anchor). Never inherit.
  508. */
  509. if (wanted & PARSE_ANCHOR) {
  510. if (given.anchor && *given.anchor) {
  511. tail += strlen(tail);
  512. if (wanted & PARSE_PUNCTUATION)
  513. *tail++ = '#';
  514. strcpy(tail, given.anchor);
  515. }
  516. }
  517. /*
  518. * If there are any blanks remaining in the string, escape them as needed.
  519. * See the discussion in LYLegitimizeHREF() for example.
  520. */
  521. if ((p = strchr(result, ' ')) != 0) {
  522. switch (is_url(result)) {
  523. case UNKNOWN_URL_TYPE:
  524. CTRACE((tfp, "HTParse: ignore:`%s'\n", result));
  525. break;
  526. case LYNXEXEC_URL_TYPE:
  527. case LYNXPROG_URL_TYPE:
  528. case LYNXCGI_URL_TYPE:
  529. case LYNXPRINT_URL_TYPE:
  530. case LYNXHIST_URL_TYPE:
  531. case LYNXDOWNLOAD_URL_TYPE:
  532. case LYNXKEYMAP_URL_TYPE:
  533. case LYNXIMGMAP_URL_TYPE:
  534. case LYNXCOOKIE_URL_TYPE:
  535. case LYNXDIRED_URL_TYPE:
  536. case LYNXOPTIONS_URL_TYPE:
  537. case LYNXCFG_URL_TYPE:
  538. case LYNXCOMPILE_OPTS_URL_TYPE:
  539. case LYNXMESSAGES_URL_TYPE:
  540. CTRACE((tfp, "HTParse: spaces:`%s'\n", result));
  541. break;
  542. case NOT_A_URL_TYPE:
  543. default:
  544. CTRACE((tfp, "HTParse: encode:`%s'\n", result));
  545. do {
  546. q = p + strlen(p) + 2;
  547. while (q != p + 1) {
  548. q[0] = q[-2];
  549. --q;
  550. }
  551. p[0] = '%';
  552. p[1] = '2';
  553. p[2] = '0';
  554. } while ((p = strchr(result, ' ')) != 0);
  555. break;
  556. }
  557. }
  558. CTRACE((tfp, "HTParse: result:`%s'\n", result));
  559. StrAllocCopy(return_value, result);
  560. LYalloca_free(result);
  561. /* FIXME: could be optimized using HTParse() internals */
  562. if (*relatedName &&
  563. ((wanted & PARSE_ALL_WITHOUT_ANCHOR) == PARSE_ALL_WITHOUT_ANCHOR)) {
  564. /*
  565. * Check whether to fill in localhost. - FM
  566. */
  567. LYFillLocalFileURL(&return_value, relatedName);
  568. CTRACE((tfp, "pass LYFillLocalFile:`%s'\n", return_value));
  569. }
  570. return return_value; /* exactly the right length */
  571. }
  572. /* HTParseAnchor(), fast HTParse() specialization
  573. * ----------------------------------------------
  574. *
  575. * On exit,
  576. * returns A pointer within input string (probably to its end '\0')
  577. */
  578. const char *HTParseAnchor(const char *aName)
  579. {
  580. const char *p = aName;
  581. for (; *p && *p != '#'; p++) ;
  582. if (*p == '#') {
  583. /* the safe way based on HTParse() -
  584. * keeping in mind scan() peculiarities on schemes:
  585. */
  586. struct struct_parts given;
  587. char *name = (char *) LYalloca((p - aName) + strlen(p) + 1);
  588. if (name == NULL) {
  589. outofmem(__FILE__, "HTParseAnchor");
  590. }
  591. strcpy(name, aName);
  592. scan(name, &given);
  593. LYalloca_free(name);
  594. p++; /*next to '#' */
  595. if (given.anchor == NULL) {
  596. for (; *p; p++) /*scroll to end '\0' */
  597. ;
  598. }
  599. }
  600. return p;
  601. }
  602. /* Simplify a filename. HTSimplify()
  603. * --------------------
  604. *
  605. * A unix-style file is allowed to contain the sequence xxx/../ which may
  606. * be replaced by "" , and the sequence "/./" which may be replaced by "/".
  607. * Simplification helps us recognize duplicate filenames.
  608. *
  609. * Thus, /etc/junk/../fred becomes /etc/fred
  610. * /etc/junk/./fred becomes /etc/junk/fred
  611. *
  612. * but we should NOT change
  613. * http://fred.xxx.edu/../..
  614. *
  615. * or ../../albert.html
  616. */
  617. void HTSimplify(char *filename)
  618. {
  619. char *p;
  620. char *q, *q1;
  621. if (filename == NULL)
  622. return;
  623. if (!(filename[0] && filename[1]) ||
  624. filename[0] == '?' || filename[1] == '?' || filename[2] == '?')
  625. return;
  626. if (strchr(filename, '/') != NULL) {
  627. for (p = (filename + 2); *p; p++) {
  628. if (*p == '?') {
  629. /*
  630. * We're still treating a ?searchpart as part of the path in
  631. * HTParse() and scan(), but if we encounter a '?' here, assume
  632. * it's the delimiter and break. We also could check for a
  633. * parameter delimiter (';') here, but the current Fielding
  634. * draft (wisely or ill-advisedly :) says that it should be
  635. * ignored and collapsing be allowed in it's value). The only
  636. * defined parameter at present is ;type=[A, I, or D] for ftp
  637. * URLs, so if there's a "/..", "/../", "/./", or terminal '.'
  638. * following the ';', it must be due to the ';' being an
  639. * unescaped path character and not actually a parameter
  640. * delimiter. - FM
  641. */
  642. break;
  643. }
  644. if (*p == '/') {
  645. if ((p[1] == '.') && (p[2] == '.') &&
  646. (p[3] == '/' || p[3] == '?' || p[3] == '\0')) {
  647. /*
  648. * Handle "../", "..?" or "..".
  649. */
  650. for (q = (p - 1); (q >= filename) && (*q != '/'); q--)
  651. /*
  652. * Back up to previous slash or beginning of string.
  653. */
  654. ;
  655. if ((q[0] == '/') &&
  656. (strncmp(q, "/../", 4) &&
  657. strncmp(q, "/..?", 4)) &&
  658. !((q - 1) > filename && q[-1] == '/')) {
  659. /*
  660. * Not at beginning of string or in a host field, so
  661. * remove the "/xxx/..".
  662. */
  663. q1 = (p + 3);
  664. p = q;
  665. while (*q1 != '\0')
  666. *p++ = *q1++;
  667. *p = '\0'; /* terminate */
  668. /*
  669. * Start again with previous slash.
  670. */
  671. p = (q - 1);
  672. }
  673. } else if (p[1] == '.' && p[2] == '/') {
  674. /*
  675. * Handle "./" by removing both characters.
  676. */
  677. q = p;
  678. q1 = (p + 2);
  679. while (*q1 != '\0')
  680. *q++ = *q1++;
  681. *q = '\0'; /* terminate */
  682. p--;
  683. } else if (p[1] == '.' && p[2] == '?') {
  684. /*
  685. * Handle ".?" by removing the dot.
  686. */
  687. q = (p + 1);
  688. q1 = (p + 2);
  689. while (*q1 != '\0')
  690. *q++ = *q1++;
  691. *q = '\0'; /* terminate */
  692. p--;
  693. } else if (p[1] == '.' && p[2] == '\0') {
  694. /*
  695. * Handle terminal "." by removing the character.
  696. */
  697. p[1] = '\0';
  698. }
  699. }
  700. }
  701. if (p >= filename + 2 && *p == '?' && *(p - 1) == '.') {
  702. if (*(p - 2) == '/') {
  703. /*
  704. * Handle "/.?" by removing the dot.
  705. */
  706. q = p - 1;
  707. q1 = p;
  708. while (*q1 != '\0')
  709. *q++ = *q1++;
  710. *q = '\0';
  711. } else if (*(p - 2) == '.' &&
  712. p >= filename + 4 && *(p - 3) == '/' &&
  713. (*(p - 4) != '/' ||
  714. (p > filename + 4 && *(p - 5) != ':'))) {
  715. /*
  716. * Handle "xxx/..?"
  717. */
  718. for (q = (p - 4); (q > filename) && (*q != '/'); q--)
  719. /*
  720. * Back up to previous slash or beginning of string.
  721. */
  722. ;
  723. if (*q == '/') {
  724. if (q > filename && *(q - 1) == '/' &&
  725. !(q > filename + 1 && *(q - 1) != ':'))
  726. return;
  727. q++;
  728. }
  729. if (strncmp(q, "../", 3) && strncmp(q, "./", 2)) {
  730. /*
  731. * Not after "//" at beginning of string or after "://",
  732. * and xxx is not ".." or ".", so remove the "xxx/..".
  733. */
  734. q1 = p;
  735. p = q;
  736. while (*q1 != '\0')
  737. *p++ = *q1++;
  738. *p = '\0'; /* terminate */
  739. }
  740. }
  741. }
  742. }
  743. }
  744. /* Make Relative Name. HTRelative()
  745. * -------------------
  746. *
  747. * This function creates and returns a string which gives an expression of
  748. * one address as related to another. Where there is no relation, an absolute
  749. * address is returned.
  750. *
  751. * On entry,
  752. * Both names must be absolute, fully qualified names of nodes
  753. * (no anchor bits)
  754. *
  755. * On exit,
  756. * The return result points to a newly allocated name which, if
  757. * parsed by HTParse relative to relatedName, will yield aName.
  758. * The caller is responsible for freeing the resulting name later.
  759. *
  760. */
  761. char *HTRelative(const char *aName,
  762. const char *relatedName)
  763. {
  764. char *result = NULL;
  765. const char *p = aName;
  766. const char *q = relatedName;
  767. const char *after_access = NULL;
  768. const char *path = NULL;
  769. const char *last_slash = NULL;
  770. int slashes = 0;
  771. for (; *p; p++, q++) { /* Find extent of match */
  772. if (*p != *q)
  773. break;
  774. if (*p == ':')
  775. after_access = p + 1;
  776. if (*p == '/') {
  777. last_slash = p;
  778. slashes++;
  779. if (slashes == 3)
  780. path = p;
  781. }
  782. }
  783. /* q, p point to the first non-matching character or zero */
  784. if (!after_access) { /* Different access */
  785. StrAllocCopy(result, aName);
  786. } else if (slashes < 3) { /* Different nodes */
  787. StrAllocCopy(result, after_access);
  788. } else if (slashes == 3) { /* Same node, different path */
  789. StrAllocCopy(result, path);
  790. } else { /* Some path in common */
  791. int levels = 0;
  792. for (; *q && (*q != '#'); q++)
  793. if (*q == '/')
  794. levels++;
  795. result = typecallocn(char, 3 * levels + strlen(last_slash) + 1);
  796. if (result == NULL)
  797. outofmem(__FILE__, "HTRelative");
  798. result[0] = '\0';
  799. for (; levels; levels--)
  800. strcat(result, "../");
  801. strcat(result, last_slash + 1);
  802. }
  803. CTRACE((tfp,
  804. "HTparse: `%s' expressed relative to\n `%s' is\n `%s'.\n",
  805. aName, relatedName, result));
  806. return result;
  807. }
  808. /* Escape undesirable characters using % HTEscape()
  809. * -------------------------------------
  810. *
  811. * This function takes a pointer to a string in which
  812. * some characters may be unacceptable unescaped.
  813. * It returns a string which has these characters
  814. * represented by a '%' character followed by two hex digits.
  815. *
  816. * Unlike HTUnEscape(), this routine returns a calloc'd string.
  817. */
  818. /* *INDENT-OFF* */
  819. static const unsigned char isAcceptable[96] =
  820. /* Bit 0 xalpha -- see HTFile.h
  821. * Bit 1 xpalpha -- as xalpha but with plus.
  822. * Bit 2 ... path -- as xpalphas but with /
  823. */
  824. /* 0 1 2 3 4 5 6 7 8 9 A B C D E F */
  825. { 0,0,0,0,0,0,0,0,0,0,7,6,0,7,7,4, /* 2x !"#$%&'()*+,-./ */
  826. 7,7,7,7,7,7,7,7,7,7,0,0,0,0,0,0, /* 3x 0123456789:;<=>? */
  827. 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /* 4x @ABCDEFGHIJKLMNO */
  828. 7,7,7,7,7,7,7,7,7,7,7,0,0,0,0,7, /* 5X PQRSTUVWXYZ[\]^_ */
  829. 0,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /* 6x `abcdefghijklmno */
  830. 7,7,7,7,7,7,7,7,7,7,7,0,0,0,0,0 }; /* 7X pqrstuvwxyz{|}~ DEL */
  831. /* *INDENT-ON* */
  832. static const char *hex = "0123456789ABCDEF";
  833. #define ACCEPTABLE(a) ( a>=32 && a<128 && ((isAcceptable[a-32]) & mask))
  834. char *HTEscape(const char *str,
  835. unsigned char mask)
  836. {
  837. const char *p;
  838. char *q;
  839. char *result;
  840. int unacceptable = 0;
  841. for (p = str; *p; p++)
  842. if (!ACCEPTABLE(UCH(TOASCII(*p))))
  843. unacceptable++;
  844. result = typecallocn(char, p - str + unacceptable + unacceptable + 1);
  845. if (result == NULL)
  846. outofmem(__FILE__, "HTEscape");
  847. for (q = result, p = str; *p; p++) {
  848. unsigned char a = TOASCII(*p);
  849. if (!ACCEPTABLE(a)) {
  850. *q++ = HEX_ESCAPE; /* Means hex coming */
  851. *q++ = hex[a >> 4];
  852. *q++ = hex[a & 15];
  853. } else
  854. *q++ = *p;
  855. }
  856. *q++ = '\0'; /* Terminate */
  857. return result;
  858. }
  859. /* Escape unsafe characters using % HTEscapeUnsafe()
  860. * --------------------------------
  861. *
  862. * This function takes a pointer to a string in which
  863. * some characters may be that may be unsafe are unescaped.
  864. * It returns a string which has these characters
  865. * represented by a '%' character followed by two hex digits.
  866. *
 * Unlike HTUnEscape(), this routine returns a calloc'd string.
  868. */
  869. #define UNSAFE(ch) (((ch) <= 32) || ((ch) >= 127))
  870. char *HTEscapeUnsafe(const char *str)
  871. {
  872. const char *p;
  873. char *q;
  874. char *result;
  875. int unacceptable = 0;
  876. for (p = str; *p; p++)
  877. if (UNSAFE(UCH(TOASCII(*p))))
  878. unacceptable++;
  879. result = typecallocn(char, p - str + unacceptable + unacceptable + 1);
  880. if (result == NULL)
  881. outofmem(__FILE__, "HTEscapeUnsafe");
  882. for (q = result, p = str; *p; p++) {
  883. unsigned char a = TOASCII(*p);
  884. if (UNSAFE(a)) {
  885. *q++ = HEX_ESCAPE; /* Means hex coming */
  886. *q++ = hex[a >> 4];
  887. *q++ = hex[a & 15];
  888. } else
  889. *q++ = *p;
  890. }
  891. *q++ = '\0'; /* Terminate */
  892. return result;
  893. }
  894. /* Escape undesirable characters using % but space to +. HTEscapeSP()
  895. * -----------------------------------------------------
  896. *
  897. * This function takes a pointer to a string in which
  898. * some characters may be unacceptable unescaped.
  899. * It returns a string which has these characters
  900. * represented by a '%' character followed by two hex digits,
 * except that spaces are converted to '+' instead of %20.
  902. *
  903. * Unlike HTUnEscape(), this routine returns a calloced string.
  904. */
  905. char *HTEscapeSP(const char *str,
  906. unsigned char mask)
  907. {
  908. const char *p;
  909. char *q;
  910. char *result;
  911. int unacceptable = 0;
  912. for (p = str; *p; p++)
  913. if (!(*p == ' ' || ACCEPTABLE(UCH(TOASCII(*p)))))
  914. unacceptable++;
  915. result = typecallocn(char, p - str + unacceptable + unacceptable + 1);
  916. if (result == NULL)
  917. outofmem(__FILE__, "HTEscape");
  918. for (q = result, p = str; *p; p++) {
  919. unsigned char a = TOASCII(*p);
  920. if (a == 32) {
  921. *q++ = '+';
  922. } else if (!ACCEPTABLE(a)) {
  923. *q++ = HEX_ESCAPE; /* Means hex coming */
  924. *q++ = hex[a >> 4];
  925. *q++ = hex[a & 15];
  926. } else {
  927. *q++ = *p;
  928. }
  929. }
  930. *q++ = '\0'; /* Terminate */
  931. return result;
  932. }
  933. /* Decode %xx escaped characters. HTUnEscape()
  934. * ------------------------------
  935. *
  936. * This function takes a pointer to a string in which some
  937. * characters may have been encoded in %xy form, where xy is
  938. * the ASCII hex code for character 16x+y.
  939. * The string is converted in place, as it will never grow.
  940. */
  941. static char from_hex(char c)
  942. {
  943. return (char) (c >= '0' && c <= '9' ? c - '0'
  944. : c >= 'A' && c <= 'F' ? c - 'A' + 10
  945. : c - 'a' + 10); /* accept small letters just in case */
  946. }
  947. char *HTUnEscape(char *str)
  948. {
  949. char *p = str;
  950. char *q = str;
  951. if (!(p && *p))
  952. return str;
  953. while (*p != '\0') {
  954. if (*p == HEX_ESCAPE &&
  955. /*
  956. * Tests shouldn't be needed, but better safe than sorry.
  957. */
  958. p[1] && p[2] &&
  959. isxdigit(UCH(p[1])) &&
  960. isxdigit(UCH(p[2]))) {
  961. p++;
  962. if (*p)
  963. *q = (char) (from_hex(*p++) * 16);
  964. if (*p) {
  965. /*
  966. * Careful! FROMASCII() may evaluate its arg more than once!
  967. */
  968. /* S/390 -- gil -- 0221 */
  969. *q = (char) (*q + from_hex(*p++));
  970. }
  971. *q = FROMASCII(*q);
  972. q++;
  973. } else {
  974. *q++ = *p++;
  975. }
  976. }
  977. *q++ = '\0';
  978. return str;
  979. } /* HTUnEscape */
  980. /* Decode some %xx escaped characters. HTUnEscapeSome()
  981. * ----------------------------------- Klaus Weide
  982. * (kweide@tezcat.com)
  983. * This function takes a pointer to a string in which some
  984. * characters may have been encoded in %xy form, where xy is
  985. * the ASCII hex code for character 16x+y, and a pointer to
  986. * a second string containing one or more characters which
  987. * should be unescaped if escaped in the first string.
  988. * The first string is converted in place, as it will never grow.
  989. */
  990. char *HTUnEscapeSome(char *str,
  991. const char *do_trans)
  992. {
  993. char *p = str;
  994. char *q = str;
  995. char testcode;
  996. if (p == NULL || *p == '\0' || do_trans == NULL || *do_trans == '\0')
  997. return str;
  998. while (*p != '\0') {
  999. if (*p == HEX_ESCAPE &&
  1000. p[1] && p[2] && /* tests shouldn't be needed, but.. */
  1001. isxdigit(UCH(p[1])) &&
  1002. isxdigit(UCH(p[2])) &&
  1003. (testcode = (char) FROMASCII(from_hex(p[1]) * 16 +
  1004. from_hex(p[2]))) && /* %00 no good */
  1005. strchr(do_trans, testcode)) { /* it's one of the ones we want */
  1006. *q++ = testcode;
  1007. p += 3;
  1008. } else {
  1009. *q++ = *p++;
  1010. }
  1011. }
  1012. *q++ = '\0';
  1013. return str;
  1014. } /* HTUnEscapeSome */
  1015. /* *INDENT-OFF* */
  1016. static const unsigned char crfc[96] =
  1017. /* Bit 0 xalpha -- need "quoting"
  1018. * Bit 1 xpalpha -- need \escape if quoted
  1019. */
  1020. /* 0 1 2 3 4 5 6 7 8 9 A B C D E F */
  1021. { 1,0,3,0,0,0,0,0,1,1,0,0,1,0,1,0, /* 2x !"#$%&'()*+,-./ */
  1022. 0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0, /* 3x 0123456789:;<=>? */
  1023. 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 4x @ABCDEFGHIJKLMNO */
  1024. 0,0,0,0,0,0,0,0,0,0,0,1,2,1,0,0, /* 5X PQRSTUVWXYZ[\]^_ */
  1025. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 6x `abcdefghijklmno */
  1026. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3 }; /* 7X pqrstuvwxyz{|}~ DEL */
  1027. /* *INDENT-ON* */
  1028. #define ASCII_TAB '\011'
  1029. #define ASCII_LF '\012'
  1030. #define ASCII_CR '\015'
  1031. #define ASCII_SPC '\040'
  1032. #define ASCII_BAK '\134'
  1033. /*
  1034. * Turn a string which is not a RFC 822 token into a quoted-string. - KW
  1035. * The "quoted" parameter tells whether we need the beginning/ending quote
  1036. * marks. If not, the caller will provide them -TD
  1037. */
  1038. void HTMake822Word(char **str,
  1039. int quoted)
  1040. {
  1041. const char *p;
  1042. char *q;
  1043. char *result;
  1044. unsigned char a;
  1045. int added = 0;
  1046. if (isEmpty(*str)) {
  1047. StrAllocCopy(*str, quoted ? "\"\"" : "");
  1048. return;
  1049. }
  1050. for (p = *str; *p; p++) {
  1051. a = TOASCII(*p); /* S/390 -- gil -- 0240 */
  1052. if (a < 32 || a >= 128 ||
  1053. ((crfc[a - 32]) & 1)) {
  1054. if (!added)
  1055. added = 2;
  1056. if (a >= 160 || a == '\t')
  1057. continue;
  1058. if (a == '\r' || a == '\n')
  1059. added += 2;
  1060. else if ((a & 127) < 32 || ((crfc[a - 32]) & 2))
  1061. added++;
  1062. }
  1063. }
  1064. if (!added)
  1065. return;
  1066. result = typecallocn(char, p - (*str) + added + 1);
  1067. if (result == NULL)
  1068. outofmem(__FILE__, "HTMake822Word");
  1069. q = result;
  1070. if (quoted)
  1071. *q++ = '"';
  1072. /*
  1073. * Having converted the character to ASCII, we can't use symbolic
  1074. * escape codes, since they're in the host character set, which
  1075. * is not necessarily ASCII. Thus we use octal escape codes instead.
  1076. * -- gil (Paul Gilmartin) <pg@sweng.stortek.com>
  1077. */
  1078. /* S/390 -- gil -- 0268 */
  1079. for (p = *str; *p; p++) {
  1080. a = TOASCII(*p);
  1081. if ((a != ASCII_TAB) &&
  1082. ((a & 127) < ASCII_SPC ||
  1083. (a < 128 && ((crfc[a - 32]) & 2))))
  1084. *q++ = ASCII_BAK;
  1085. *q++ = *p;
  1086. if (a == ASCII_LF ||
  1087. (a == ASCII_CR && (TOASCII(*(p + 1)) != ASCII_LF)))
  1088. *q++ = ' ';
  1089. }
  1090. if (quoted)
  1091. *q++ = '"';
  1092. *q++ = '\0'; /* Terminate */
  1093. FREE(*str);
  1094. *str = result;
  1095. }