2017-03-18 11 views
1

웹 사이트에서 페이지와 그 페이지에 포함된 모든 이미지를 가져오는 (꽤 못생긴) 메서드가 있습니다. 웹 페이지를 가져오는 것은 전혀 문제가 되지 않습니다. 그러나 이미지를 가져오면 이상하게 깨져서 나옵니다. 테스트에 사용하고 있는 URI는 http://www.themountaingoats.net/contact.html 입니다. 이 웹 페이지는 매우 단순하며 테스트에 필요한 모든 것이 있습니다. 동일한 소켓 연결을 통해 이미지와 텍스트를 보내려면 어떻게 합니까

줄 끝 문자로 \r 또는 \n을 사용하면 결과가 서로 달라지고, \r\n을 사용하면 이미지를 열 수조차 없습니다. enter image description here

enter image description here

enter image description here

/**
 * Fetches a web page and every image referenced by its {@code <img>} tags over one socket,
 * saving the page to {@code <domain>.txt} and each image to a file named after its
 * {@code src} with every '/' replaced by '.'.
 *
 * <p>All responses, including the images, are read with {@code BufferedReader.readLine()}
 * and written back with a {@code FileWriter}.
 *
 * <p>NOTE(review): {@code readLine()} decodes bytes to characters and strips the line
 * terminator ({@code \r}, {@code \n} or {@code \r\n}); the writer then re-encodes the text and
 * appends a fixed terminator. That round trip does not preserve arbitrary bytes, so the saved
 * image files differ from what the server sent.
 *
 * @param uri host and path without a scheme, split at the first '/' (host before, path after);
 *            NOTE(review): a value without '/' makes the {@code [1]} index below fail
 * @param port TCP port to connect to
 * @return always {@code null}
 * @throws IOException if the connection, a read or a file write fails
 */
public static String GET(String uri, int port) throws IOException { 

     // Everything before the first '/' is the host, everything after it is the path.
     String domain = uri.split("/",2)[0]; 
     String filename = uri.split("/",2)[1]; 
     Socket socket = new Socket(domain, port); 


     // Send the page request to the server.
     System.out.println(socket.isConnected()); 
     DataOutputStream outToServer = new DataOutputStream(socket.getOutputStream()); 
     // Character-based reader over the socket; it is used for the page AND for the images below.
     BufferedReader inFromServer = new BufferedReader(new InputStreamReader(socket.getInputStream())); 
     String request = "GET " +"/"+ filename + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n"; 
     System.out.println(request); 
     outToServer.writeBytes(request); 

     // Create the file that receives the page response.
     File file = new File(domain+".txt"); 
     // If the file does not exist yet, create it.
     if (!file.exists()) { 
      file.createNewFile(); 
     } 
     // Opening a PrintWriter on the file and closing it again leaves the file empty.
     PrintWriter writer = new PrintWriter(file); 
     writer.print(""); 
     writer.close(); 

     // Remaining-length estimate; 100 is a placeholder until a Content-Length header is seen.
     int characterCounter=100; 
     while(characterCounter >= 0){ 
      // NOTE(review): readLine() returns null at end of stream, which would make the
      // startsWith() calls below throw a NullPointerException.
      String serverSentence = inFromServer.readLine(); 
      System.out.println(serverSentence); 
      if (serverSentence.startsWith("Content-Length:")){ 
       characterCounter = Integer.parseInt(serverSentence.replace("Content-Length: ","")); 
      } 
      // Every line that does not start with one of the listed header names is counted
      // against the remaining length, as its character count plus 1 for the terminator.
      // NOTE(review): the status line and the blank separator line are not in the list, so
      // they are counted too; Content-Length is in bytes, not characters.
      if (!serverSentence.startsWith("Cache-Control: ") && !serverSentence.startsWith("Content-Type: ") && !serverSentence.startsWith("Date: ") && !serverSentence.startsWith("Etag: ") 
        && !serverSentence.startsWith("Expires: ") && !serverSentence.startsWith("Last-Modified: ") && !serverSentence.startsWith("Server: ") && !serverSentence.startsWith("Vary: ") 
        && !serverSentence.startsWith("X-Cache: ") && !serverSentence.startsWith("Content-Length: ")){ 
       characterCounter = characterCounter - serverSentence.length()-1; 
      } 

      // Append the line (headers included) to the page file; the file is reopened per line.
      FileWriter fw = new FileWriter(file.getAbsoluteFile(),true); 
      BufferedWriter bw = new BufferedWriter(fw); 
      bw.write(serverSentence+"\r\n"); 
      bw.close(); 
     } 


     // Parse the saved response (headers and HTML together) and collect the <img> elements.
     Document doc = Jsoup.parse(file, "UTF-8"); 
     Elements imgs = doc.getElementsByTag("img"); 

     System.out.println(imgs); 


     for (Element link : imgs) { 
      String source = link.attr("src"); 

      // Strip the own-host prefix so that only the path remains.
      source = source.replace("http://"+domain+"", ""); 

      System.out.println(source); 


      // Create the file that receives the image; '/' in the path becomes '.'.
      File image = new File(source.replace("/", ".")); 
      // If the file does not exist yet, create it.
      if (!image.exists()) { 
       image.createNewFile(); 
      } 

      // Leave the image file empty before appending to it.
      PrintWriter imageWriter = new PrintWriter(image); 
      imageWriter.print(""); 
      imageWriter.close(); 

      // Request the image on the same connection.
      String requestImage = "GET "+ source + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n"; 
      System.out.println(requestImage); 
      outToServer.writeBytes(requestImage); 

      // flag becomes true at the first non-empty line that follows an empty line,
      // which is how this loop recognises the start of the response body.
      boolean flag = false; 
      String previousServerSentence = "something not empty"; 
      characterCounter=100; 
      while(characterCounter > 0){ 
       // NOTE(review): image bytes are read here as text lines; see the method comment.
       String serverSentence = inFromServer.readLine(); 
       System.out.println(serverSentence); 
       if (serverSentence.startsWith("Content-Length:")){ 
        characterCounter = Integer.parseInt(serverSentence.replace("Content-Length: ","")); 
       } 

       if (!flag){ 
        if (previousServerSentence.matches("") && !serverSentence.matches("")){ 
         flag = true; 
        } 
       } 

       // Once the body has started, write every line that does not start with one of the
       // header names listed below.
       // NOTE(review): a body line that happens to start with one of these names is skipped.
       if ((!serverSentence.startsWith("Cache-Control: ") && !serverSentence.startsWith("Content-Type: ") && !serverSentence.startsWith("Date: ") && !serverSentence.startsWith("Etag: ") 
         && !serverSentence.startsWith("Expires: ") && !serverSentence.startsWith("Last-Modified: ") && !serverSentence.startsWith("Server: ") && !serverSentence.startsWith("Vary: ") 
         && !serverSentence.startsWith("X-Cache: ") && !serverSentence.startsWith("Content-Length: ") && !serverSentence.startsWith("ETag: ") && !serverSentence.startsWith("Accept-Ranges: ") 
         && !serverSentence.startsWith("Accept-Language: ") && !serverSentence.startsWith("Accept-Datetime: ") && !serverSentence.startsWith("Authorization: ") 
         && !serverSentence.startsWith("Connection: ") && !serverSentence.startsWith("Content-Language: ") && !serverSentence.startsWith("Content-Length: ") 
         && !serverSentence.startsWith("Content-Location: ") && !serverSentence.startsWith("Content-MD5: ") && !serverSentence.startsWith("Content-Range: ") 
         && !serverSentence.startsWith("Content-Type: ") && !serverSentence.startsWith("Date: ") && !serverSentence.startsWith("expect: ") 
         && !serverSentence.startsWith("From: ") && !serverSentence.startsWith("Host: ") && !serverSentence.startsWith("If-Match: ") && !serverSentence.startsWith("If-Modified-Since: ") 
         && !serverSentence.startsWith("Accept: ") && !serverSentence.startsWith("Accept-Charset: ") && !serverSentence.startsWith("Accept-Encoding: ") 
         && !serverSentence.startsWith("Age: ") && !serverSentence.startsWith("Allow: ") && !serverSentence.startsWith("Content-Encoding: ") 
         && !serverSentence.startsWith("If-None-Match: ") && !serverSentence.startsWith("If-Range: ") && !serverSentence.startsWith("If-Unmodified-Since: ") 
         && !serverSentence.startsWith("Last-Modified: ") && !serverSentence.startsWith("Location: ") && !serverSentence.startsWith("Max-Forwards: ") 
         && !serverSentence.startsWith("Pragma: ") && !serverSentence.startsWith("Proxy-Authenticate: ") && !serverSentence.startsWith("Proxy-Authorization: ") 
         && !serverSentence.startsWith("Range: ") && !serverSentence.startsWith("Referer: ") && !serverSentence.startsWith("Retry-After: ") 
         && !serverSentence.startsWith("Server: ") && !serverSentence.startsWith("TE: ") && !serverSentence.startsWith("Trailer: ") 
         && !serverSentence.startsWith("Transfer-Encoding: ") && !serverSentence.startsWith("Upgrade: ") && !serverSentence.startsWith("User-Agent: ") 
         && !serverSentence.startsWith("Via: ") && !serverSentence.startsWith("Warning: ") && !serverSentence.startsWith("WWW-Authenticate: ")) 
         && flag){ 
        characterCounter = characterCounter - serverSentence.length()-1; 
        // Append the line to the image file.

        FileWriter fw = new FileWriter(image.getAbsoluteFile(),true); 
        BufferedWriter bw = new BufferedWriter(fw); 
        // Whatever terminator the server sent is replaced by a single "\r" here.
        bw.write(serverSentence+"\r"); 
        bw.close(); 


       } 

       previousServerSentence = serverSentence; 
      } 


     } 
     // NOTE(review): the socket is never closed in this version.
     return null; 
    } 
첫 번째 이미지는 줄 끝 문자로 \r을 사용한 결과이고, 두 번째 이미지는 \n을 사용한 결과이며, 마지막 이미지가 원본입니다. 이미지가 왜 이렇게 엉망이 되는지 전혀 모르겠습니다.

제 질문은 : 왜 이런 일이 일어나고 어떻게 수정합니까?

편집 :

enter image description here

이제 그림의 일부는 가져올 수 있지만 전체 그림은 가져오지 못합니다:

/**
 * Fetches a web page and the images referenced by its {@code <img>} tags over one socket.
 * The page is read as text lines into {@code <domain>.txt}; each image response is read as
 * raw bytes and everything after the first {@code \r\n\r\n} is written to a file named after
 * the image's {@code src} with every '/' replaced by '.'.
 *
 * <p>NOTE(review): the header scan and the first body write in the image loop use
 * {@code buffersize} where the number of bytes actually read ({@code length}) is meant, so
 * bytes beyond the received data are written to the file as well.
 *
 * @param uri host and path without a scheme, split at the first '/' (host before, path after);
 *            NOTE(review): a value without '/' makes the {@code [1]} index below fail
 * @param port TCP port to connect to
 * @return always {@code null}
 * @throws IOException if the connection, a read or a file write fails
 */
public static String GET(String uri, int port) throws IOException { 

     /* 
     * Retrieval of the webpage 
     */ 

     // Everything before the first '/' is the host, everything after it is the path.
     String domain = uri.split("/",2)[0]; 
     String filename = uri.split("/",2)[1]; 
     Socket socket = new Socket(domain, port); 


     // Send the page request to the server.
     System.out.println(socket.isConnected()); 
     DataOutputStream outToServer = new DataOutputStream(socket.getOutputStream()); 
     // Character-based reader, used for the page response only.
     BufferedReader inFromServer = new BufferedReader(new InputStreamReader(socket.getInputStream())); 
     String request = "GET " +"/"+ filename + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n"; 
     System.out.println(request); 
     outToServer.writeBytes(request); 

     // Create the file that receives the page response.
     File file = new File(domain+".txt"); 
     // If the file does not exist yet, create it.
     if (!file.exists()) { 
      file.createNewFile(); 
     } 
     // Opening a PrintWriter on the file and closing it again leaves the file empty.
     PrintWriter writer = new PrintWriter(file); 
     writer.print(""); 
     writer.close(); 

     // Remaining-length estimate; 100 is a placeholder until a Content-Length header is seen.
     int characterCounter=100; 
     while(characterCounter >= 0){ 
      // NOTE(review): readLine() returns null at end of stream, which would make the
      // startsWith() calls below throw a NullPointerException.
      String serverSentence = inFromServer.readLine(); 
      System.out.println(serverSentence); 
      if (serverSentence.startsWith("Content-Length:")){ 
       characterCounter = Integer.parseInt(serverSentence.replace("Content-Length: ","")); 
      } 
      // Lines that do not start with one of the listed header names are counted against
      // the remaining length, as their character count plus 1 for the terminator.
      if (!serverSentence.startsWith("Cache-Control: ") && !serverSentence.startsWith("Content-Type: ") && !serverSentence.startsWith("Date: ") && !serverSentence.startsWith("Etag: ") 
        && !serverSentence.startsWith("Expires: ") && !serverSentence.startsWith("Last-Modified: ") && !serverSentence.startsWith("Server: ") && !serverSentence.startsWith("Vary: ") 
        && !serverSentence.startsWith("X-Cache: ") && !serverSentence.startsWith("Content-Length: ")){ 
       characterCounter = characterCounter - serverSentence.length()-1; 
      } 

      // Append the line (headers included) to the page file; the file is reopened per line.
      FileWriter fw = new FileWriter(file.getAbsoluteFile(),true); 
      BufferedWriter bw = new BufferedWriter(fw); 
      bw.write(serverSentence+"\r\n"); 
      bw.close(); 
     } 

     /* 
     * Retrieval of all the embedded images on the webpage that are on the same domain. 
     */ 

     Document doc = Jsoup.parse(file, "UTF-8"); 
     Elements imgs = doc.getElementsByTag("img"); 

     System.out.println(imgs); 



     for (Element link : imgs) { 
      String source = link.attr("src"); 

      // Strip the own-host prefix so that only the path remains.
      source = source.replace("http://"+domain+"", ""); 

      System.out.println(source); 

      // Create the file that receives the image; '/' in the path becomes '.'.
      File image = new File(source.replace("/", ".")); 
      // If the file does not exist yet, create it.
      if (!image.exists()) { 
       image.createNewFile(); 
      } 

      // Initialize the streams. 
      // NOTE(review): this reads the socket directly, bypassing inFromServer; any bytes that
      // reader has already buffered would not be seen here.
      final FileOutputStream fileOutputStream = new FileOutputStream(image); 
      final InputStream inputStream = socket.getInputStream(); 

      // Header end flag. 
      boolean headerEnded = false; 

      String requestImage = "GET "+ source + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n"; 
      System.out.println(requestImage); 
      outToServer.writeBytes(requestImage); 

      int buffersize = 1000000; 
      byte[] bytes = new byte[buffersize]; 
      // Number of bytes the most recent read() placed at the start of the buffer.
      int length; 

      // The loop only ends when read() reports end of stream (-1).
      // NOTE(review): Content-Length is not consulted, so on a connection the server keeps
      // open this presumably waits until the server closes it.
      while ((length = inputStream.read(bytes)) != -1) { 
       // If the end of the header had already been reached, write the bytes to the file as normal. 
       if (headerEnded){ 

        fileOutputStream.write(bytes, 0, length); 
       } 
       // This locates the end of the header by comparing the current byte as well as the next 3 bytes 
       // with the HTTP header end "\r\n\r\n" (which in integer representation would be 13 10 13 10). 
       // If the end of the header is reached, the flag is set to true and the remaining data in the 
       // currently buffered byte array is written into the file. 
       // NOTE(review): the scan covers the whole buffer rather than the first `length` bytes,
       // and the write below emits buffersize-i-4 bytes, i.e. everything up to the end of the
       // 1,000,000-byte array, not only the bytes received in this read.
       // NOTE(review): if the terminator is not inside this chunk, the chunk is dropped.
       else { 
        for (int i = 0; i < buffersize-3; i++) { 
         if (bytes[i] == 13 && bytes[i + 1] == 10 && bytes[i + 2] == 13 && bytes[i + 3] == 10) { 
          headerEnded = true; 
          fileOutputStream.write(bytes, i+4 , buffersize-i-4); 
          break; 
         } 
        } 
       } 
      } 

      // NOTE(review): per the java.net.Socket documentation, closing the stream returned by
      // getInputStream() closes the socket, so a following loop iteration cannot reuse it.
      inputStream.close(); 
      fileOutputStream.close(); 

     } 
     socket.close(); 
     return null; 
    } 

이것이 지금의 결과입니다. 버퍼 크기를 조정하면 이미지가 조금 더 많이 또는 조금 더 적게 받아집니다.

EDIT2 : 오류가 발생했습니다. 단지 몇 가지 차원과 관련이 있습니다. 최종 작업 코드 : 가능하면 HTTP 요청을 처리하기 위해 원시 소켓을 사용

/**
 * Downloads a web page and the images it embeds over a single HTTP/1.1 connection.
 *
 * <p>The page response (status line, headers and body) is saved to {@code <domain>.txt} and
 * parsed with Jsoup. Every {@code <img>} whose {@code src} points at the same host is then
 * requested on the same socket and its body is saved to a file named after the {@code src}
 * with every '/' replaced by '.'.
 *
 * <p>All responses are read as raw bytes from one buffered stream. The end of each body is
 * taken from {@code Content-Length} or from chunked framing, so the connection can carry the
 * next request without waiting for the server to close it. If a response carries neither, its
 * body is read until the server closes the connection and the remaining images are skipped.
 *
 * @param uri host and optional path without a scheme, e.g. {@code "example.com/index.html"};
 *            a bare host requests {@code "/"}
 * @param port TCP port to connect to
 * @return always {@code null}
 * @throws IOException if the connection fails, a response is malformed or truncated, or a file
 *                     cannot be written
 */
public static String GET(String uri, int port) throws IOException {

    /*
    * Retrieval of the webpage
    */

    String[] hostAndPath = uri.split("/", 2);
    String domain = hostAndPath[0];
    // A bare host such as "example.com" requests the root document instead of failing.
    String filename = hostAndPath.length > 1 ? hostAndPath[1] : "";
    String pagePath = "/" + filename;
    // Directory of the page; relative image paths are resolved against it.
    String baseDirectory = pagePath.substring(0, pagePath.lastIndexOf('/') + 1);

    // try-with-resources closes the socket on every exit path, including exceptions.
    try (Socket socket = new Socket(domain, port)) {
        System.out.println(socket.isConnected());
        DataOutputStream outToServer = new DataOutputStream(socket.getOutputStream());
        // One byte stream for every response: mixing a Reader and a raw stream on the same
        // socket loses whatever the Reader has buffered ahead. (Fully qualified because the
        // file's import section is not visible from here.)
        InputStream inFromServer = new java.io.BufferedInputStream(socket.getInputStream());

        File file = new File(domain + ".txt");
        sendGetRequest(outToServer, pagePath, domain);
        boolean connectionReusable;
        // FileOutputStream creates the file if needed and truncates an existing one.
        try (FileOutputStream pageOutput = new FileOutputStream(file)) {
            connectionReusable = readHttpResponse(inFromServer, pageOutput, true);
        }

        /*
        * Retrieval of all the embedded images on the webpage that are on the same domain.
        */

        Document doc = Jsoup.parse(file, "UTF-8");
        Elements imgs = doc.getElementsByTag("img");

        System.out.println(imgs);

        for (Element link : imgs) {
            if (!connectionReusable) {
                System.out.println("Connection closed by the server; remaining images are skipped.");
                break;
            }

            // Getting the link ready for the GET query.
            String source = link.attr("src").replace("http://" + domain, "");
            System.out.println(source);

            // Images on other hosts and inline data cannot be fetched over this connection.
            if (source.isEmpty() || source.contains("://") || source.startsWith("//")
                    || source.startsWith("data:")) {
                continue;
            }
            String imagePath = source.startsWith("/") ? source : baseDirectory + source;

            File image = new File(source.replace("/", "."));
            sendGetRequest(outToServer, imagePath, domain);
            try (FileOutputStream imageOutput = new FileOutputStream(image)) {
                connectionReusable = readHttpResponse(inFromServer, imageOutput, false);
            }
        }
    }
    return null;
}

/** Writes a minimal HTTP/1.1 GET request for {@code path} on {@code host} and flushes it. */
private static void sendGetRequest(DataOutputStream out, String path, String host) throws IOException {
    String request = "GET " + path + " HTTP/1.1\r\n" + "Host: " + host + "\r\n\r\n";
    System.out.println(request);
    out.writeBytes(request);
    out.flush();
}

/**
 * Reads one HTTP response from {@code in} and writes its body (and, if {@code keepHeaders} is
 * set, the status line and headers too) to {@code out}.
 *
 * @return {@code true} if the response was self-delimiting and the server did not announce
 *         {@code Connection: close}, i.e. another request may be sent on the same connection
 */
private static boolean readHttpResponse(InputStream in, FileOutputStream out, boolean keepHeaders)
        throws IOException {
    long contentLength = -1;
    boolean chunked = false;
    boolean closeAnnounced = false;

    // Header section: status line plus header lines, terminated by an empty line.
    while (true) {
        String line = readAsciiLine(in);
        if (line == null) {
            throw new IOException("Connection closed before the end of the response header");
        }
        System.out.println(line);
        if (keepHeaders) {
            out.write((line + "\r\n").getBytes("ISO-8859-1"));
        }
        if (line.isEmpty()) {
            break;
        }
        int colon = line.indexOf(':');
        if (colon <= 0) {
            continue; // status line or something that is not a "name: value" header
        }
        // Header names are case-insensitive, so compare them that way.
        String name = line.substring(0, colon).trim();
        String value = line.substring(colon + 1).trim();
        if (name.equalsIgnoreCase("Content-Length")) {
            try {
                contentLength = Long.parseLong(value);
            } catch (NumberFormatException e) {
                throw new IOException("Invalid Content-Length header: " + value, e);
            }
        } else if (name.equalsIgnoreCase("Transfer-Encoding")) {
            chunked = value.toLowerCase().contains("chunked");
        } else if (name.equalsIgnoreCase("Connection")) {
            closeAnnounced = value.equalsIgnoreCase("close");
        }
    }

    if (chunked) {
        copyChunked(in, out);
        return !closeAnnounced;
    }
    if (contentLength >= 0) {
        copyExactly(in, out, contentLength);
        return !closeAnnounced;
    }

    // No framing information: the body ends when the server closes the connection.
    byte[] buffer = new byte[8192];
    int read;
    while ((read = in.read(buffer)) != -1) {
        out.write(buffer, 0, read);
    }
    return false;
}

/** Copies exactly {@code count} bytes from {@code in} to {@code out}; fails if the stream ends early. */
private static void copyExactly(InputStream in, FileOutputStream out, long count) throws IOException {
    byte[] buffer = new byte[8192];
    long remaining = count;
    while (remaining > 0) {
        // Never ask for more than the body still owes, so the next response stays unread.
        int read = in.read(buffer, 0, (int) Math.min(buffer.length, remaining));
        if (read == -1) {
            throw new IOException("Connection closed with " + remaining + " body bytes missing");
        }
        // Only the bytes actually received are written, not the whole buffer.
        out.write(buffer, 0, read);
        remaining -= read;
    }
}

/** Decodes a chunked body from {@code in} into {@code out}, consuming any trailer lines. */
private static void copyChunked(InputStream in, FileOutputStream out) throws IOException {
    while (true) {
        String sizeLine = readAsciiLine(in);
        if (sizeLine == null) {
            throw new IOException("Connection closed inside a chunked body");
        }
        // A chunk-size line may carry extensions after ';' which are ignored.
        int semicolon = sizeLine.indexOf(';');
        String hex = (semicolon >= 0 ? sizeLine.substring(0, semicolon) : sizeLine).trim();
        long size;
        try {
            size = Long.parseLong(hex, 16);
        } catch (NumberFormatException e) {
            throw new IOException("Invalid chunk size: " + sizeLine, e);
        }
        if (size == 0) {
            break;
        }
        copyExactly(in, out, size);
        readAsciiLine(in); // line terminator that follows the chunk data
    }
    // Optional trailer headers end with an empty line.
    String trailer;
    while ((trailer = readAsciiLine(in)) != null && !trailer.isEmpty()) {
        // trailers are not needed here
    }
}

/**
 * Reads one line byte by byte, mapping each byte to the char with the same value, and returns
 * it without its {@code \n} or {@code \r\n} terminator; {@code null} if the stream is already
 * at its end.
 */
private static String readAsciiLine(InputStream in) throws IOException {
    StringBuilder line = new StringBuilder();
    int next;
    while ((next = in.read()) != -1 && next != '\n') {
        line.append((char) next);
    }
    if (next == -1 && line.length() == 0) {
        return null;
    }
    int end = line.length();
    if (end > 0 && line.charAt(end - 1) == '\r') {
        line.setLength(end - 1);
    }
    return line.toString();
}
+0

URLConnection 대신 반드시 소켓을 사용해야 합니까? 해결 방법은 Alexay의 답변을 참조하십시오. – JCasso

+0

이것은 과제용이라 소켓을 사용해야 하며, URLConnection 같은 것을 사용하여 데이터를 가져올 수 없습니다. – Sander

답변

1

가능하면 HTTP 요청을 처리하는 데 원시 소켓을 사용하지 마십시오. 이미지 파일을 가져오는 데 별도의 연결을 사용할 수 있습니까?

4ndrew의 답변을 참조하십시오: https://stackoverflow.com/a/8679160/176873

원시 소켓을 꼭 써야 한다면 java.io.BufferedReader를 사용하지 마십시오. BufferedReader는 이진 데이터를 읽는 데 사용하면 안 됩니다. 지금 코드는 이진 데이터를 문자열로 변환한 뒤 로컬 PC에 텍스트 파일로 쓰고 있습니다. https://stackoverflow.com/a/34106534/176873

+0

두 번째 링크의 방법이 거의 동작하는 것 같습니다. 이제 이미지를 부분적으로 다운로드할 수 있습니다. 하지만 버퍼를 아무리 크게 설정해도 이미지의 처음 몇 줄만 가져옵니다. 오히려 버퍼 크기를 너무 크게 설정하면 가져오는 줄 수가 줄어듭니다. 변경된 코드는 질문의 편집 부분에 있습니다. – Sander

+0

조금 손을 봐야 했지만 해결책을 찾았으므로 답변을 채택했습니다. 고맙습니다. – Sander