HTML to SQLite Example

There are instances where so-called "web APIs" returning JSON do not offer as much information as webpage HTML. Example: HN webpages indicate which items are [flagged] or labeled as [dupe] but the "HN API" JSON does not. Using a UNIX filter we can create our own "API" by transforming the HTML to SQLite and querying data from HN webpages as an SQLite database.

Reference: https://news.ycombinator.com/item?id=42923154

2 points | by 1vuio0pswjnm7 8 hours ago

5 comments

  • 1vuio0pswjnm7 7 hours ago
    .

       flex -8Cem 095.l
    
       cc -O3 -std=c89 -pedantic -W -Wall lex.yy.c -pipe -static -s -o yy095
       
       curl -A"" -s https://news.ycombinator.com|yy095|sqlite3 0.db
       
       curl -A"" -s https://news.ycombinator.com/newest|yy095|sqlite3 0.db
    
    Example usage, assuming "the" is a keyword/string of interest

       # get count of stories with keyword/string
       echo select count\(\) from t1 where title like \'%the%\'|sqlite3 0.db
    
       # get count of all stories with keyword/string in the title that are flagged
       echo select count\(id\) from t1 where title like \'%the%\' and status like \'%[flagged]%\'|sqlite3 0.db
    
       # get list of all stories with keyword/string in the title that are flagged
       echo select id,title from t1 where title like \'%the%\' and status like \'%[flagged]%\'|sqlite3 0.db
    
       # get status counts for stories with keyword/string in title
       echo select status from t1 where title like \'%the%\' order by status|sqlite3 0.db|uniq -c
    
         415 [dead]
          17 [dupe]
          12 [dupe] [dead]
          23 [flagged]
         131 [flagged] [dead]
        2320 [ok]
    
       # get status counts for stories with keyword/string in title for the last week
       x=$((`date +%s`-604800))  
       echo select status from t1 where title like \'%the%\' and time \> $x order by status|sqlite3 0.db|uniq -c
  • 1vuio0pswjnm7 7 hours ago
    Part 2/2

        /*```````````title```````````*/
       <MODE1>\">[^<]+ if(z){
        z=0;
        for(x=2;x<yyleng;x++)
        putc(yytext[x],yyout);
        fwrite("',",1,2,yyout);
        }
       <MODE1>class=\"subline\" w++;
        /*```````````score```````````*/
       <MODE1>id=\"score_[0-9]+\">[^ ]+ {
        for(x=y+12;x<yyleng;x++)
        putc(yytext[x],yyout);
        putc(44,yyout);
        }
        /*```````````user``````````` */
       <MODE1>id=[^\"]+\"[ ]class=\"hnuser\" {
        putc(39,yyout);
        for(x=3;x<yyleng-16;x++)
        putc(yytext[x],yyout);
        fwrite("',",1,2,yyout);
        }
        /*```````````time```````````*/
       <MODE1>class=\"age\"[ ]title=\"[^ ]+ {
        if(!w)fwrite("0,'pg',",1,7,yyout);
        y=0;for(x=19;x<yyleng;x++)
        {yytext[y]=yytext[x];y++;}
        yytext[yyleng]=0;
        strptime(yytext,"%Y-%m-%dT%H:%M:%S UTC %Y",&t);
        printf("%ld,",mktime(&t));
        if(!w)fwrite("0);\n",1,4,yyout);
        }
        /*```````````comments```````````*/
       <MODE1>class=\"hnpast\">past<\/a>[ ][ ] fwrite("0);\n",1,4,yyout);
       <MODE1>item\?id=[0-9]+\">discuss fwrite("0);\n",1,4,yyout); 
       <MODE1>item\?id=[0-9]+\">[0-9]+&nbsp; {
        for(x=8;x<yyleng;x++)
        if(yytext[x]==62)break; 
        for(x=x+1;x<yyleng-6;x++)
        putc(yytext[x],yyout);
        fwrite(") ON CONFLICT(id) DO UPDATE SET ",1,32,yyout);
        fwrite("status=excluded.status,score=exc",1,32,yyout);
        fwrite("luded.score,comments=excluded.co",1,32,yyout);
        fwrite("mments,title=excluded.title;\n",1,29,stdout);
        }
       .|\n
       %%
        int main(int argc, char *argv[]){
        if(argv)
        if(argc==1)
        {
        fwrite("PRAGMA foreign_keys=OFF;\n",1,25,stdout);
        fwrite("BEGIN TRANSACTION;\n",1,19,stdout);
        fwrite("CREATE TABLE IF NOT EXISTS t1(id",1,32,stdout);
        fwrite(" INTEGER PRIMARY KEY,status TEXT",1,32,stdout);
        fwrite(",url TEXT,title TEXT,score INTEG",1,32,stdout);
        fwrite("ER,user TEXT,time INTEGER,commen",1,32,stdout);
        fwrite("ts INTEGER);\n",1,13,stdout);
        }
        yylex();
        if(argc==1)fwrite("COMMIT;\n",1,8,stdout);
        exit(0);}
       
        /*
        non-indented lines: 12-14,16,22,28,29,30,36,38,56,61,66,71,76,81,88,94,96,102,109,119-121,131,132
         */
  • 1vuio0pswjnm7 8 hours ago
    Part 1/2

        /* html to sql example */
        #define jmp (yy_start) = 1 + 2 *
        int fileno (FILE *);
        FILE *f;
        int printf(const char *__restrict, ...);
        int w,x,y,z;
        int setenv (const char *, const char *, int);
        char text[1024];
        #include <time.h>
        char *strptime(const char *s, const char *f, struct tm *tm);
        struct tm t;
       %option nounput noinput noyywrap
       %s MODE1 
       %%
        /*```````````input:1-column list```````````*/
       ^[0-9]+\n {
        fwrite("https://hacker-news.firebaseio.com/v0/item/",1,43,yyout);
        fwrite(yytext,1,yyleng-1,yyout);
        fwrite(".json\n",1,6,yyout);
        }
        /*```````````input:json```````````*/
       \"id\":[0-9]+ {
        fwrite("https://hacker-news.firebaseio.com/v0/item/",1,43,yyout);
        for(x=5;x<yyleng;x++)
        putc(yytext[x],yyout);
        fwrite(".json\n",1,6,yyout);
        }
       \"kids\":\[ z++;
       \] if(z)z=0;
       [0-9]+ if(z){
        fwrite("https://hacker-news.firebaseio.com/v0/item/",1,43,yyout);
        fwrite(yytext,1,yyleng,yyout);
        fwrite(".json\n",1,6,yyout);
        }
        /*```````````input:html```````````*/
       \74html[ ]lang= jmp MODE1;
        /*```````````id```````````*/
       <MODE1>[ ]id='[0-9]+' {
        fwrite("INSERT or IGNORE INTO t1 VALUES(",1,32,stdout);
        for(x=5;x<yyleng-1;x++)
        putc(yytext[x],yyout);
        y=yyleng-6;
        fwrite(",'",1,2,yyout);
        w=0;
        }
        /*```````````status```````````*/
        /* 1 [ok] */
        /* 2 \[dupe\] */
        /* 3 \[dead\] */
        /* 4 \[flagged\] */
        /* 5 \[dupe\][ ]{2}\[dead\] */
        /* 6 \[flagged\][ ]{2}\[dead\] */
        /* 7 \[flagged\][ ]{2}\[dupe\] omit */
        /* 8 \[flagged\][ ]{2}\[dupe\][ ]\[dead\] omit */
        /*```````````url```````````*/
       <MODE1>class=\"titleline\">[ ]\[flagged\][ ]{2}\[dead\][ ]<a[ ]href=\"[^\"]+ if(!z){
        fwrite("[flagged] [dead]','",1,19,yyout);
        z=6;for(x=46;x<yyleng;x++)putc(yytext[x],yyout);
        fwrite("','",1,3,yyout);
        }
       <MODE1>class=\"titleline\">[ ]\[dupe\][ ]{2}\[dead\][ ]<a[ ]href=\"[^\"]+ if(!z){
        fwrite("[dupe] [dead]','",1,16,yyout);
        z=5;for(x=43;x<yyleng;x++)putc(yytext[x],yyout);
        fwrite("','",1,3,yyout);
        }
       <MODE1>class=\"titleline\">[ ]\[flagged\][ ]<a[ ]href=\"[^\"]+ if(!z){
        fwrite("[flagged]','",1,12,yyout);
        z=4;for(x=38;x<yyleng;x++)putc(yytext[x],yyout);
        fwrite("','",1,3,yyout);
        }
       <MODE1>class=\"titleline\">[ ]\[dead\][ ]<a[ ]href=\"[^\"]+ if(!z){
        fwrite("[dead]','",1,9,yyout);
        z=3;for(x=35;x<yyleng;x++)putc(yytext[x],yyout);
        fwrite("','",1,3,yyout);
        }
       <MODE1>class=\"titleline\">[ ]\[dupe\][ ]<a[ ]href=\"[^\"]+ if(!z){
        fwrite("[dupe]','",1,9,yyout);
        z=2;for(x=35;x<yyleng;x++)putc(yytext[x],yyout);
        fwrite("','",1,3,yyout);
        }
       <MODE1>class=\"titleline\"><a[ ]href=\"[^\"]+ if(!z){
        fwrite("[ok]','",1,7,yyout);
        z=1;for(x=27;x<yyleng;x++)
        putc(yytext[x],yyout);
        fwrite("','",1,3,yyout);
        }
    • 1vuio0pswjnm7 5 hours ago
      No need for printf function or text buffer.